Skip to content

Commit

Permalink
Merge pull request #12 from opensafely/weekly-update-mar-23
Browse files Browse the repository at this point in the history
Weekly update mar 23
  • Loading branch information
brianmackenna authored Mar 24, 2021
2 parents d2dab22 + 4783380 commit e6acb0c
Show file tree
Hide file tree
Showing 38 changed files with 33,837 additions and 21,686 deletions.
9 changes: 5 additions & 4 deletions lib/create_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
# we create a dict for renaming population variables into suitable longer/correctly capitalised forms for presentation as titles
variable_renaming = {'ageband': "Age band",
'ageband 5yr': "Age band",
'ageband_5yr': "Age band",
'sex': "Sex",
'bmi':"BMI",
'ethnicity 6 groups':"Ethnicity (broad categories)",
Expand Down Expand Up @@ -117,8 +118,8 @@ def find_and_sort_filenames(foldername, *,
]
sort_order = {key: ix for ix, key in enumerate(ordered_dems)}
elif by_demographics_or_population=="population":
ordered_pops = ['80+', '70-79', 'care home', 'shielding (aged 16-69)', '65-69',
'under 65s, not in other eligible groups shown']
ordered_pops = ['80+', '70-79', 'care home', 'shielding (aged 16-69)', '65-69', 'LD (aged 16-64)', '60-64', '55-59', '50-54',
'under 60s, not in other eligible groups shown']
sort_order = {key: ix for ix, key in enumerate(ordered_pops)}
else:
display("sort_by_population_or_demographics received an invalid value")
Expand Down Expand Up @@ -260,11 +261,11 @@ def show_table(filename, latest_date_fmt, *, org_breakdown=None, show_carehomes=
display(Markdown(f"- Population includes those known to live in an elderly care home, based upon clinical coding."))
elif "shielding" in title:
display(Markdown(f"- Population excludes those over 65 known to live in an elderly care home, based upon clinical coding."))
elif ("80+" in title) | ("70-79" in title) | ("65-69" in title):
elif ("80+" in title) | ("70-79" in title) | ("65-69" in title): # don't include under 65s here
display(Markdown(f"- Population excludes those known to live in an elderly care home, based upon clinical coding."))

# display note that 65-69 and LD group excludes shielding subgroup
if ("65-69" in title) | ("LD (aged 16-64)" in title):
if ("65-69" in title) | ("60-64" in title) | ("55-59" in title) | ("50-54" in title) | ("LD (aged 16-64)" in title):
display(Markdown(f"- Population excludes those who are currently shielding."))


Expand Down
39 changes: 20 additions & 19 deletions lib/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,27 +80,30 @@ def load_data(input_file='input_delivery.csv', input_path="output"):
df = df.assign(bmi = np.where((df["bmi"]=="Not obese"), "under 30", "30+"))

# drop unnecessary columns or columns created for processing
df = df.drop(["imd","ethnicity_16", "ethnicity", "adrenaline_pen", "has_died", "has_follow_up"], 1)



# shielding: keep flag only for under 70s
df["shielded"] = np.where((df["shielded"]==1) & (df["age"]<70), "shielding (aged 16-69)", "")
df = df.drop(["imd","ethnicity_16", "ethnicity", 'ethnicity_6_sus',
'ethnicity_16_sus', "adrenaline_pen", "has_died", "has_follow_up"], 1)

# categorise into priority groups (similar to the national groups but not exactly the same)
conditions = [
(df["care_home"]==1) & (df["age"]>=65),
(df["age"]>=80),
(df["age"]>=70),
(df["shielded"]==1),
(df["age"]>=65),
(df["LD"]==1),
(df["age"]>=60),
(df["age"]>=55),
(df["age"]>=50)]
choices = [3,1,2,4,5,6,7,8,9]
# note the numbers here denote the desired sort order in which we want to look at these groups, not the priority order

###### care homes #####
# amend community age band to remove any care home flags for under 65s
df.loc[(df["ageband_community"]=="care home") & (df["age"]<60), "ageband_community"] = df["ageband"] # 10 yr age band
df.loc[(df["ageband_community"]=="care home") & (df["age"]>=60) & (df["age"]<65), "ageband_community"] = "60-64" # 5 yr age band

# amend community age band to remove any people shielding from the under 70s groups (they will be reported in shielded group)
df.loc[(df["ageband_community"]!="care home") & (df["age"]<70) & (df["shielded"]=="shielding (aged 16-69)"), "ageband_community"] = "shielding (aged 16-69)"
# create field "priority_group" which uses the appropriate value from `choices` according to the `conditions` met by each line of data. If none are met, assigns 0.
# Eg. for patient aged 71 but not in a care home, patient does not meet the first or second criteria, but meets the third so is assigned to the third of the `choices` i.e. `2`.
df['priority_group'] = np.select(conditions, choices, default=0)

# rename column for clarity
df = df.rename(columns={"shielded_since_feb_15":"newly_shielded_since_feb_15"})

# LD: keep flag only for under 65s and those not shielding
df["LD_group"] = np.where((df["LD"]==1) & (df["age"]<65) &(df["shielded"]==""), "LD (aged 16-64)", "")

# for each specific situation or condition, replace 1 with YES and 0 with no. This makes the graphs easier to read
for c in ["LD", "newly_shielded_since_feb_15", "dementia",
"chronic_cardiac_disease", "current_copd", "dialysis", "dmards","psychosis_schiz_bipolar",
Expand All @@ -110,15 +113,13 @@ def load_data(input_file='input_delivery.csv', input_path="output"):
"temporary_immunosuppression", "asplenia"]:
df[c] = np.where(df[c]==1, "yes", "no")

# rename columns for agebands for consistency
df = df.rename(columns={"ageband_community":"community_ageband"})

# get total population sizes and names for each STP
stps = pd.read_csv(os.path.join("..","lib","stp_dict.csv"), usecols=["stp_id","name","list_size_o80"])
df = df.merge(stps, left_on="stp", right_on="stp_id", how="left").rename(columns={"name":"stp_name"})

# drop additional columns
df = df.drop(["age","stp_id"], 1)
df = df.drop(['registered', 'care_home', 'age',"stp_id", "ageband_community"], 1)

return df

Expand Down
33 changes: 13 additions & 20 deletions lib/data_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# import custom functions from 'lib' folder
import sys
sys.path.append('../lib/')
from report_results import create_output_dirs, filter_other_group, round7
from report_results import create_output_dirs, round7

def ethnicity_completeness(df, groups_of_interest):

Expand All @@ -16,33 +16,24 @@ def ethnicity_completeness(df, groups_of_interest):
Inputs:
df (dataframe): processed patient-level dataframe containing "ethnicity_6_groups" column,
as well as "community_ageband" (for filtering to given group) and "patient_id" (for counting)
group (str): group of interest e.g. "80+"
name_of_other_group (str): name to give for the general population who are vaccinated but do not have recorded eligibility factors
groups_not_in_other_group (list): groups to exclude from "other" group, i.e. all currently included criteria for eligibility
as well as "group" & "group_name" (to identify vaccine priority group) and "patient_id" (for counting)
groups_of_interest (dict): dict mapping names of population/eligible subgroups to integers (1-9, and 0 for "other")
Outputs:
displays string describing n and % of given group with ethnicity known
'''
# create copy of df only with cols of interest
cols = [v for v in set(groups_of_interest.values()) if v != "other"]
cols.extend(["ethnicity_6_groups","patient_id"])

cols = ["group", "group_name", "ethnicity_6_groups","patient_id"]

ethnicity_coverage = pd.DataFrame(columns=["group", "n with ethnicity", "total population (n)", "ethnicity coverage (%)"])

for i, (group_title, group_label) in enumerate(groups_of_interest.items()):

for i, (groupname, groupno) in enumerate(groups_of_interest.items()):
out = df[cols].copy()
# filter dataframe to eligible group
if group_label == "other": # for "all others" filter out the each of the defined groups
# we want to exclude all the other eligible groups from the "other" group
out = filter_other_group(out, groups_of_interest=groups_of_interest)
elif group_label != "community_ageband": # for groups not defined as age bands or care home, filter out care home population
out = out.loc[(out["community_ageband"]!="care home") & (out[group_label]==group_title)]
# will need a further filter for the "clinically vulnerable" group here
else: # age groups / care home
out = out.loc[(out[group_label]==group_title)]

out = out.loc[(out["group_name"]==groupname)]

total = round7(out["patient_id"].nunique())

known_eth = out.groupby("ethnicity_6_groups")[["patient_id"]].nunique().reset_index()
Expand All @@ -52,11 +43,13 @@ def ethnicity_completeness(df, groups_of_interest):

# export ethnicity coverage stats to text file
savepath, _, _ = create_output_dirs()
ethnicity_coverage.loc[i] = [group_title, known_eth, total, percent]
if groupno == 0: # for the other group the denominator is unknown and only vaccinated people are included
groupname ="vaccinated "+groupname
ethnicity_coverage.loc[i] = [groupname, known_eth, total, percent]
ethnicity_coverage.to_csv(os.path.join(savepath["text"], "ethnicity_coverage.csv"), index=False)


display(Markdown(f"Total **{group_title}** population with ethnicity recorded {known_eth:,d} ({percent}%)"))
display(Markdown(f"Total **{groupname}** population with ethnicity recorded {known_eth:,d} ({percent}%)"))



Expand Down
76 changes: 26 additions & 50 deletions lib/report_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,37 +80,23 @@ def find_and_save_latest_date(df, savepath, reference_column_name="covid_vacc_da
return latest_date, latest_date_fmt


def remove_other_key(d):
'''
Mutate a dictionary to remove key which has value "other"
def filtering(d):
'''
Find items from the full set of single digit numbers (0-9) which are not present as values in a given dict
keys = list(d.keys())
d2 = d.copy()
for key, val in d.items():
if val == "other":
del d2[key]
return d2


def filter_other_group(df, groups_of_interest):
'''
Exclude all the other eligible groups from the "other" group in dataframe provided. "other" should be contained in the groups_of_interest.
Inputs:
d (dict): a dict mapping strings to numeric `values`
Args:
df (dataframe): input data
groups_of_interest (dict): subgroups to breakdown by (and column on which to filter for these groups)
Outputs:
l (list): a list containing zero and all the single digit numbers (0-9) which do not appear in d
Returns:
df (dataframe)
'''
groups_to_exclude = groups_of_interest.copy()

groups_to_exclude = remove_other_key(groups_to_exclude)
for group_to_exclude, filter_col in groups_to_exclude.items():
df = df.loc[(~df[filter_col].isin([group_to_exclude]))]

return df
all_keys = [0,1,2,3,4,5,6,7,8,9]
keys = list(d.values())

# check which of `all_keys` are absent in `keys` and return them as a list (but always include 0)
l = [k for k in all_keys if ((k not in keys)|(k==0))]
return l


def cumulative_sums(df, groups_of_interest, features_dict, latest_date):
Expand All @@ -119,7 +105,7 @@ def cumulative_sums(df, groups_of_interest, features_dict, latest_date):
Args:
df (dataframe): input data
groups_of_interest (dict): subgroups to breakdown by (and column on which to filter for these groups)
groups_of_interest (dict): dict mapping names of population/eligible subgroups to integers (1-9, and 0 for "other")
features_dict (dict): dictionary mapping population subgroups to a list of demographic/clinical factors to include for that group
latest_date (str): "YYYY-MM-DD"
Expand All @@ -134,18 +120,16 @@ def cumulative_sums(df, groups_of_interest, features_dict, latest_date):
# also selects columns of interest. For example, in care home, we are interested in
# sex, ageband and broad ethnicity groups. In the analysis of age bands we are interested
# in much more detail such as comorbidities and ethnicity in 16 groups.
for group_title, group_label in groups_of_interest.items():

# filter dataframe to eligible group
if group_label == "other": # for "all others" filter out the each of the defined groups
out = df.copy()
# we want to exclude all the other eligible groups from the "other" group
out = filter_other_group(out, groups_of_interest=groups_of_interest)
elif group_label != "community_ageband": # for groups not defined as age bands or care home, filter out care home population
out = df.copy().loc[(df["community_ageband"]!="care home") & (df[group_label]==group_title)]
# will need a further filter for the "clinically vulnerable" group here
else: # age groups / care home
out = df.copy().loc[(df[group_label]==group_title)]
# make a new field for the priority groups we are looking at (where any we have not specifically listed are regrouped as 0/"other")
items_to_group = filtering(groups_of_interest)
df["group"] = np.where(df["priority_group"].isin(items_to_group), 0, df["priority_group"])
# translate number into name
for name, number in groups_of_interest.items():
df.loc[df["group"]==number, "group_name"] = name

for group_title, group_label in groups_of_interest.items():
out = df.copy().loc[(df["group"]==group_label)]

# define columns to include, ie. a list of features of interest (e.g. ageband, ethnicity) per population group
if group_title in features_dict:
Expand Down Expand Up @@ -213,7 +197,6 @@ def filtered_cumulative_sum(df, columns, latest_date):

# figures by demographic/clinical features
for feature in columns:

if feature=="sex":
df = df.loc[df[feature].isin(["M","F"])]

Expand Down Expand Up @@ -248,7 +231,7 @@ def filtered_cumulative_sum(df, columns, latest_date):
return df_dict_temp


def make_vaccine_graphs(df, latest_date, savepath, savepath_figure_csvs, name_of_other_group="other", suffix=""):
def make_vaccine_graphs(df, latest_date, savepath, savepath_figure_csvs, suffix=""):
'''
Cumulative chart by day of total vaccines given across key eligible groups
Expand All @@ -257,21 +240,14 @@ def make_vaccine_graphs(df, latest_date, savepath, savepath_figure_csvs, name_of
latest_date (str): latest date across dataset in YYYY-MM-DD format
savepath (dict): path to save figure as svg (savepath["figures"])
savepath_figure_csvs (str): path to save machine readable csv for recreating the chart
name_of_other_group (str): option to rename "other" group as something more descriptive
groups_of_interest (dict): population subgroups
'''

dfp = df.copy().loc[(df["covid_vacc_date"]!=0)]

dfp["group"] = np.where( dfp["community_ageband"].isin(["80+","70-79","care home", "65-69"]),
dfp["community_ageband"], "other" )
# separate shielding and LD groups out from "other" group
dfp["group"] = np.where( (dfp["shielded"]=="shielding (aged 16-69)") & (dfp["group"]=="other"), "shielding (aged 16-69)", dfp["group"] )
dfp["group"] = np.where( (dfp["LD_group"]=="LD (aged 16-64)") & (dfp["group"]=="other"), "LD (aged 16-64)", dfp["group"] )

dfp = dfp.groupby(["covid_vacc_date","group"])[["patient_id"]].count()

dfp = dfp.groupby(["covid_vacc_date","group_name"])[["patient_id"]].count()
dfp = dfp.unstack().fillna(0).cumsum().reset_index().replace([0,1,2,3,4,5,6],0)

dfp = dfp.rename(columns={"other":name_of_other_group})
dfp["covid_vacc_date"] = pd.to_datetime(dfp["covid_vacc_date"]).dt.strftime("%d %b")
dfp = dfp.set_index("covid_vacc_date")
dfp = round7(dfp)
Expand Down
Loading

0 comments on commit e6acb0c

Please sign in to comment.