Merge pull request #120 from opensafely/iaindillingham/refactoring
Refactoring
iaindillingham authored Dec 15, 2023
2 parents 9b37e23 + 9bcbce6 commit d3e826c
Showing 12 changed files with 124 additions and 142 deletions.
7 changes: 7 additions & 0 deletions analysis/__init__.py
@@ -0,0 +1,7 @@
import pathlib

WORKSPACE_DIR = pathlib.Path(__file__).parents[1]

ANALYSIS_DIR = WORKSPACE_DIR / "analysis"

OUTPUT_DIR = WORKSPACE_DIR / "output"
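
A quick illustration of how these package-level constants resolve, assuming the repository is checked out at a hypothetical /repo; this interactive sketch is not part of the commit.

>>> import pathlib
>>> workspace_dir = pathlib.Path("/repo/analysis/__init__.py").parents[1]
>>> workspace_dir
PosixPath('/repo')
>>> workspace_dir / "analysis"
PosixPath('/repo/analysis')
>>> workspace_dir / "output"
PosixPath('/repo/output')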
30 changes: 25 additions & 5 deletions analysis/aggregate.py
@@ -1,15 +1,21 @@
"""Aggregate event counts and apply Statistical Disclosure Control (SDC) functions.
For more information, see:
https://docs.opensafely.org/releasing-files/
"""
import pandas

from analysis import sdc, utils
from analysis import OUTPUT_DIR, utils

SUPPRESSION_THRESHOLD = 7
ROUNDING_MULTIPLE = 5


def main():
d_in = utils.OUTPUT_DIR / "query"
d_in = OUTPUT_DIR / "query"
event_counts = read(d_in / "rows.csv.gz")

d_out = utils.OUTPUT_DIR / "aggregate"
d_out = OUTPUT_DIR / "aggregate"
utils.makedirs(d_out)
aggregate(event_counts, "D", "sum").to_csv(d_out / "sum_by_day.csv")
aggregate(event_counts, "W", "mean").to_csv(d_out / "mean_by_week.csv")
@@ -49,12 +55,26 @@ def aggregate(event_counts, offset, func)
event_counts.pipe(resample, offset, func)
.round() # to nearest integer
.astype(int)
.pipe(sdc.redact_le_seven)
.pipe(sdc.round_to_nearest_five)
.pipe(redact_le, SUPPRESSION_THRESHOLD)
.pipe(round_to_nearest, ROUNDING_MULTIPLE)
.unstack(level=group_by)
)


def redact_le(series, threshold):
copy_of_series = series.copy(deep=True)
copy_of_series[copy_of_series <= threshold] = 0
return copy_of_series


def round_to_nearest(series, multiple):
def rounder(value):
assert isinstance(value, int), f"The value to round ({value}) must be an int"
return int(multiple * round(value / multiple, 0))

return series.apply(rounder)


def resample(event_counts, offset, func):
"""Resamples an irregular time series to a fixed frequency time series.
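
A short usage sketch of the two disclosure-control helpers that replace the deleted sdc module, using the constants defined above; the counts in the example series are hypothetical, and the script is assumed to run from the repository root so that the analysis package is importable.

import pandas

from analysis import aggregate

counts = pandas.Series([3, 7, 8, 12])
# Counts less than or equal to SUPPRESSION_THRESHOLD (7) are set to zero...
redacted = aggregate.redact_le(counts, aggregate.SUPPRESSION_THRESHOLD)
# ...and the remaining counts are rounded to the nearest ROUNDING_MULTIPLE (5).
rounded = aggregate.round_to_nearest(redacted, aggregate.ROUNDING_MULTIPLE)
print(list(rounded))  # [0, 0, 10, 10]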
26 changes: 0 additions & 26 deletions analysis/click_types.py

This file was deleted.

5 changes: 2 additions & 3 deletions analysis/generate_dummy_rows.py
@@ -6,8 +6,7 @@
import pandas
from numpy import random

from analysis import utils

from analysis import ANALYSIS_DIR, utils

rng = random.default_rng(seed=1)

@@ -27,7 +26,7 @@ def main():
("SGSS_Negative", "2020-01-01", "2022-11-17"),
("SGSS_Positive", "2020-01-03", "2022-01-29"),
]
f_out = utils.ANALYSIS_DIR / "dummy_rows.csv.gz"
f_out = ANALYSIS_DIR / "dummy_rows.csv.gz"

utils.makedirs(f_out.parent)
data_frame = make_dummy_rows(tables)
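
The generator above is created with a fixed seed, so the dummy rows are reproducible from run to run; a minimal illustration of the pattern (the draw below is not taken from the real script).

from numpy import random

rng = random.default_rng(seed=1)
print(rng.integers(0, 10, size=3))  # the same three values on every run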
50 changes: 45 additions & 5 deletions analysis/plot.py
@@ -1,30 +1,55 @@
"""Plot aggregated event counts.
"""
import collections
import pathlib
import re
import textwrap
import unicodedata

import click
import pandas
from matplotlib import pyplot

from analysis import click_types, utils
from analysis import OUTPUT_DIR, utils


class ClickTimestamp(click.ParamType):
"""The Timestamp type converts date strings into pandas.Timestamp objects."""

name = "Timestamp"

def convert(self, value, param, ctx):
return pandas.Timestamp.fromisoformat(value)


class ClickPath(click.Path):
"""The Path type converts path strings into pathlib.Path objects.
This conversion is supported by Click>=8.0.
"""

name = "Path"

def convert(self, value, param, ctx):
path = super().convert(value, param, ctx)
return pathlib.Path(path)


@click.command()
@click.option("--from-date", type=click_types.Timestamp())
@click.option("--from-date", type=ClickTimestamp())
@click.option("--from-offset", type=int)
@click.option(
"--output",
"d_out",
type=click_types.Path(file_okay=False, resolve_path=True),
type=ClickPath(file_okay=False, resolve_path=True),
required=True,
)
def main(from_date, from_offset, d_out):
# Click doesn't support option groups (search for click-option-group on PyPI for
# why), so this ensures that at least one from_* option is set.
assert (from_date is not None) or (from_offset is not None)

d_in = utils.OUTPUT_DIR / "aggregate"
d_in = OUTPUT_DIR / "aggregate"
by_day = read(d_in / "sum_by_day.csv")
by_week = read(d_in / "mean_by_week.csv")

@@ -42,7 +67,7 @@ def main(from_date, from_offset, d_out):

figs_cols = plot(by_day, by_week, get_plot_title(from_date, from_offset))
for fig, col in figs_cols:
f_stem = utils.slugify(col)
f_stem = slugify(col)
fig.savefig(d_out / f"{f_stem}.png")


@@ -113,5 +138,20 @@ def plot(by_day, by_week, plot_title):
yield fig, col


def slugify(s):
# Based on Django's slugify. For more information, see:
# https://github.com/django/django/blob/4.1.7/django/utils/text.py#L399-L417

# convert to ASCII
s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
# remove characters that are not word, white space, or dash
s = re.sub(r"[^\w\s-]", "", s)
# replace one or more dash or one or more white space with one dash
s = re.sub(r"[-\s]+", "-", s)
# remove leading and trailing dashes and underscores
s = s.strip("-_")
return s.lower()


if __name__ == "__main__":
main()
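
The slugify helper moves out of utils and into this module, and the Click parameter types replace the deleted click_types module. A small sketch of slugify's behaviour, mirroring the parametrised test added in tests/test_plot.py below; the third input string is hypothetical.

from analysis import plot

print(plot.slugify("Ça va?"))                # ca-va
print(plot.slugify("_so--so_"))              # so-so
print(plot.slugify("Mean events per week"))  # mean-events-per-week

For the option types, a value such as --from-date 2020-02-01 reaches main() as pandas.Timestamp("2020-02-01"), and the --output value arrives as a resolved pathlib.Path.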
31 changes: 24 additions & 7 deletions analysis/render_report.py
@@ -6,20 +6,22 @@
import base64
import collections
import datetime
import json
import mimetypes

import dateutil.parser
from jinja2 import Environment, FileSystemLoader, StrictUndefined

from analysis import utils
from analysis import ANALYSIS_DIR, OUTPUT_DIR, utils

ENVIRONMENT = Environment(
loader=FileSystemLoader(utils.ANALYSIS_DIR),
loader=FileSystemLoader(ANALYSIS_DIR),
undefined=StrictUndefined,
)


def main():
f_out = utils.OUTPUT_DIR / "render_report" / "report.html"
f_out = OUTPUT_DIR / "render_report" / "report.html"
utils.makedirs(f_out.parent)
rendered_report = render_report(
{
@@ -30,15 +32,15 @@ def main():
# It's passed as a template variable so that we can format it consistently
# with other template variables.
"tpp_epoch_date": datetime.date(2009, 1, 1),
"run_date": utils.get_run_date(),
"run_date": get_run_date(),
"from_date": {
"plot_from_2020": datetime.date(2020, 2, 1),
"plot_from_2016": datetime.date(2016, 1, 1),
},
"plots": group_plots(
utils.OUTPUT_DIR / "plot_from_last_30_days",
utils.OUTPUT_DIR / "plot_from_2020",
utils.OUTPUT_DIR / "plot_from_2016",
OUTPUT_DIR / "plot_from_last_30_days",
OUTPUT_DIR / "plot_from_2020",
OUTPUT_DIR / "plot_from_2016",
),
}
)
@@ -67,6 +69,21 @@ def render_report(data):
return template.render(data)


def get_log():
return [
json.loads(line)
for line in (OUTPUT_DIR / "query" / "log.json").read_text().splitlines()
]


def get_run_date():
by_event = {d["event"]: d for d in get_log()}
timestamp = by_event.get("finish_executing_sql_query", {}).get(
"timestamp", "9999-01-01T00:00:00"
)
return dateutil.parser.parse(timestamp)


def group_plots(*paths, suffix=".png"):
"""Groups similarly named plots.
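
get_log and get_run_date move here from utils unchanged. A minimal sketch of the JSON Lines log they expect, with hypothetical event payloads; the parsing below mirrors the two functions, including the 9999-01-01 fallback used when no finish event is present.

import json

import dateutil.parser

# Illustrative (hypothetical) log lines in the format read from output/query/log.json.
log_lines = [
    '{"event": "start_executing_sql_query", "timestamp": "2023-12-14T03:00:00"}',
    '{"event": "finish_executing_sql_query", "timestamp": "2023-12-14T03:05:42"}',
]
by_event = {d["event"]: d for d in map(json.loads, log_lines)}
timestamp = by_event.get("finish_executing_sql_query", {}).get(
    "timestamp", "9999-01-01T00:00:00"
)
print(dateutil.parser.parse(timestamp))  # 2023-12-14 03:05:42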
26 changes: 0 additions & 26 deletions analysis/sdc.py

This file was deleted.

42 changes: 0 additions & 42 deletions analysis/utils.py
@@ -1,51 +1,9 @@
import functools
import json
import os
import pathlib
import re
import unicodedata

import dateutil.parser

WORKSPACE_DIR = pathlib.Path(__file__).parents[1]

ANALYSIS_DIR = WORKSPACE_DIR / "analysis"

OUTPUT_DIR = WORKSPACE_DIR / "output"

makedirs = functools.partial(os.makedirs, exist_ok=True)


def slugify(s):
# Based on Django's slugify. For more information, see:
# https://github.com/django/django/blob/4.1.7/django/utils/text.py#L399-L417

# convert to ASCII
s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
# remove characters that are not word, white space, or dash
s = re.sub(r"[^\w\s-]", "", s)
# replace one or more dash or one or more white space with one dash
s = re.sub(r"[-\s]+", "-", s)
# remove leading and trailing dashes and underscores
s = s.strip("-_")
return s.lower()


def date_format(date):
"""Formats the given date as, for example, "1 January 2023"."""
return f"{date:%-d %B %Y}" # the - removes the leading zero, but not on Windows


def get_log():
return [
json.loads(line)
for line in (OUTPUT_DIR / "query" / "log.json").read_text().splitlines()
]


def get_run_date():
by_event = {d["event"]: d for d in get_log()}
timestamp = by_event.get("finish_executing_sql_query", {}).get(
"timestamp", "9999-01-01T00:00:00"
)
return dateutil.parser.parse(timestamp)
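
With the constants promoted to the package and the other helpers relocated, utils is left with the makedirs partial and date_format. A minimal sketch of the makedirs pattern, as used by the main() functions above to create output directories idempotently; the path is hypothetical.

import functools
import os
import pathlib

makedirs = functools.partial(os.makedirs, exist_ok=True)

out_dir = pathlib.Path("/tmp/example-output/aggregate")  # hypothetical path
makedirs(out_dir)  # creates intermediate directories as needed
makedirs(out_dir)  # calling again is a no-op rather than an error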
16 changes: 16 additions & 0 deletions tests/test_aggregate.py
@@ -51,3 +51,19 @@ def test_aggregate_mean_by_week():
columns=pandas.Index(["table_1"], name="table_name"),
),
)


@pytest.mark.parametrize("data_in,data_out", [(6, 0), (7, 0), (8, 8)])
def test_redact_le(data_in, data_out):
series = pandas.Series(data_in)
redacted_series = aggregate.redact_le(series, aggregate.SUPPRESSION_THRESHOLD)
assert series is not redacted_series
assert list(redacted_series) == [data_out]


@pytest.mark.parametrize("data_in,data_out", [(1, 0), (3, 5), (5, 5), (7, 5), (9, 10)])
def test_round_to_nearest(data_in, data_out):
series = pandas.Series(data_in)
rounded_series = aggregate.round_to_nearest(series, aggregate.ROUNDING_MULTIPLE)
assert series is not rounded_series
assert list(rounded_series) == [data_out]
5 changes: 5 additions & 0 deletions tests/test_plot.py
@@ -60,3 +60,8 @@ def test_get_date_ranges_from_offset(by_day):

with pytest.raises(StopIteration):
date_range = next(date_ranges)


@pytest.mark.parametrize("string,slug", [("Ça va?", "ca-va"), ("_so--so_", "so-so")])
def test_slugify(string, slug):
assert plot.slugify(string) == slug
20 changes: 0 additions & 20 deletions tests/test_sdc.py

This file was deleted.
