generated from opensafely/research-template
-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from opensafely/weekly-update-feb-9th
Weekly update feb 9th
- Loading branch information
Showing
20 changed files
with
5,209 additions
and
4,000 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,5 +15,4 @@ code,name | |
14,Other Black | ||
15, Chinese | ||
16, Other | ||
17, Not stated | ||
, | ||
17, Not stated |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
""" This module cleans the patient-level data containing COVID vaccination dates and other relevant information""" | ||
|
||
# Import statements | ||
import pandas as pd | ||
import numpy as np | ||
import os | ||
|
||
# Errors | ||
from errors import DataCleaningError | ||
|
||
|
||
|
||
def load_data(input_file='input_delivery.csv', input_path="output"):
    """
    Read the patient-level csv and clean it ready for use in the graphs and tables.

    Args:
        input_file (str): name of the input file. Default value is `input_delivery.csv`.
                          The csv should contain one-row-per-patient,
                          with columns representing various outcomes (covid vaccine dates),
                          demographics and clinical flags
        input_path (str): folder in which to find the input file. The path is
                          resolved relative to the parent of the working directory
                          (`../<input_path>/<input_file>`).

    Returns:
        Dataframe (df): Processed dataframe

    Raises:
        DataCleaningError: only via `map_ethnicity`, for an unsupported number of
                           ethnicity groups. This function does not catch or wrap
                           other failures, so a missing file or column surfaces
                           as the underlying pandas/OS exception.
    """

    # import data and fill nulls with 0
    df = pd.read_csv(os.path.join("..", input_path, input_file)).fillna(0)

    # convert ethnic categories to words. There are 2 ways of categorising - into
    # 6 groups or into 16 groups.
    # this creates a new column called `ethnicity_6_groups`
    df = map_ethnicity(df, columnname="ethnicity", number_of_groups=6)

    # this creates a new column called `ethnicity_16_groups`
    df = map_ethnicity(df, columnname="ethnicity_16", number_of_groups=16)

    # describe imd partially in words and make new column called `imd_categories` from `imd`.
    # Any value not in the lookup becomes "Unknown" (the fillna result must be
    # assigned back, otherwise unmatched rows stay NaN).
    imd_lookup = {0: "Unknown", 1: "1 Most deprived", 2: "2", 3: "3", 4: "4", 5: "5 Least deprived"}
    df["imd_categories"] = df["imd"].map(imd_lookup).fillna("Unknown")

    # Assign vaccine status (dates that were null are 0 after the fillna above)
    df = df.assign(
        covid_vacc_flag=np.where(df["covid_vacc_date"] != 0, "vaccinated", "unvaccinated"),
        covid_vacc_flag_ox=np.where(df["covid_vacc_oxford_date"] != 0, 1, 0),
        covid_vacc_flag_pfz=np.where(df["covid_vacc_pfizer_date"] != 0, 1, 0),
        covid_vacc_2nd=np.where(df["covid_vacc_second_dose_date"] != 0, 1, 0),
        covid_vacc_bin=np.where(df["covid_vacc_date"] != 0, 1, 0))

    # Assign column SSRI to be where has SSRI and no psychosis/bipolar/schizophrenia/dementia or LD.
    # This must run before the yes/no conversion further down, which turns
    # these 0/1 flags into strings.
    df = df.assign(
        ssri=np.where((df["ssri"] == 1) & (df["psychosis_schiz_bipolar"] == 0)
                      & (df["intel_dis_incl_downs_syndrome"] == 0) & (df["dementia"] == 0), 1, 0))

    # Replace a region and STP with a value `0` with Unknown
    df = df.assign(
        region=df['region'].replace(0, "Unknown"),
        stp=df['stp'].replace(0, "Unknown"))

    # Replace `I` or `U` for sex with `Other/Unknown`
    df = df.assign(
        sex=df['sex'].replace(['I', 'U'], "Other/Unknown"))

    # categorise BMI into obese (i.e. BMI >=30) or non-obese (<30)
    # NOTE(review): every value other than "Not obese" - including nulls that
    # were filled with 0 above - is labelled "30+"; confirm this is intended.
    df = df.assign(bmi=np.where((df["bmi"] == "Not obese"), "under 30", "30+"))

    # drop unnecessary columns or columns created for processing
    # (`columns=` keyword: a positional axis is not accepted by pandas 2.x)
    df = df.drop(columns=["imd", "ethnicity_16", "ethnicity", "adrenaline_pen", "has_died", "has_follow_up"])

    # care homes: regroup age bands (to later keep only 65+ labelled as care home residents)
    df.loc[(df["care_home_type"].isin(["PS", "PN", "PC"])) & (df["age"] >= 65) & (df["age"] < 70), "ageband"] = "65-69"

    # amend community age band to remove any care home flags for under 65s
    # (only elderly care homes are included so these are likely live-in staff+their families or other non-care recipients)
    df.loc[(df["ageband_community"] == "care home") & (df["age"] < 65), "ageband_community"] = df["ageband"]

    # for each specific situation or condition, replace 1 with YES and 0 with no. This makes the graphs easier to read
    for c in ["care_home", "dementia",
              "chronic_cardiac_disease", "current_copd", "dialysis", "dmards", "psychosis_schiz_bipolar",
              "solid_organ_transplantation", "chemo_or_radio", "intel_dis_incl_downs_syndrome", "ssri",
              "lung_cancer", "cancer_excl_lung_and_haem", "haematological_cancer", "bone_marrow_transplant",
              "cystic_fibrosis", "sickle_cell_disease", "permanant_immunosuppression",
              "temporary_immunosuppression", "asplenia"]:
        df[c] = np.where(df[c] == 1, "yes", "no")

    # rename ageband column for readability (`imd` was already dropped above,
    # so there is nothing left to rename for it)
    df = df.rename(columns={"ageband_community": "community_ageband"})

    # get total population sizes and names for each STP
    stps = pd.read_csv(os.path.join("..", "lib", "stp_dict.csv"), usecols=["stp_id", "name", "list_size_o80"])
    df = (
        df.merge(stps, left_on="stp", right_on="stp_id", how="left")
        .drop(columns=["care_home_type", "age", "stp_id"])
        .rename(columns={"name": "stp_name"})
    )

    return df
|
||
|
||
|
||
|
||
def map_ethnicity(df, columnname, number_of_groups):
    """
    This maps the numerical value in the dataframe to the ethnicity categories. It creates
    a new column called ethnicity_(number_of_groups)_groups. For example, ethnicity_6_groups.

    Args:
        df (Dataframe): your dataframe of interest,
                        with one row per patient where at least one column (columnname) contains an ethnicity code
        columnname (str): name of column containing ethnicity codes; nulls are treated as 0 ("Unknown")
        number_of_groups (int): Either 6 or 16. For 16 the names are read from
                                `../analysis/ethnicity_16_lookup.csv` (columns `code` and `name`).

    Returns:
        Dataframe (df): the same dataframe object, with a column added containing
                        the name for each ethnicity category

    Raises:
        DataCleaningError: if number_of_groups is neither 6 nor 16
        KeyError: if the column contains a code that is not in the lookup
    """

    if number_of_groups == 6:
        ethnicity_dict = {0: "Unknown", 1: "White", 2: "Mixed", 3: "South Asian", 4: "Black", 5: "Other"}
    elif number_of_groups == 16:
        lookup = pd.read_csv(os.path.join("..", "analysis", "ethnicity_16_lookup.csv"))
        # rows without a code (e.g. a stray empty line in the csv) cannot be mapped, so skip them
        lookup = lookup.dropna(subset=["code"])
        ethnicity_dict = {0: "Unknown"}
        ethnicity_dict.update(
            {int(code): name for code, name in zip(lookup["code"], lookup["name"])}
        )
    else:
        # the message is passed positionally: a plain Exception subclass does
        # not accept keyword arguments
        raise DataCleaningError("You have provided a non-supported number of categories (only 6 or 16 are supported)")

    new_ethnicity_column_name = f"ethnicity_{number_of_groups}_groups"
    codes = df[columnname].fillna(0).astype(int)
    # strict lookup on purpose: an unexpected code raises KeyError rather than
    # being silently mislabelled
    df[new_ethnicity_column_name] = [ethnicity_dict[code] for code in codes]

    return df
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
"Module for data quality checks" | ||
|
||
from IPython.display import display, Markdown | ||
import pandas as pd | ||
|
||
|
||
def ethnicity_completeness(df, group,
                           name_of_other_group="all other vaccinated people, not in eligible groups shown",
                           groups_not_in_other_group=("care home", "80+", "70-79")):
    '''
    Describe completeness of ethnicity coding for group of interest

    Inputs:
        df (dataframe): processed patient-level dataframe containing "ethnicity_6_groups" column,
                        as well as "community_ageband" (for filtering to given group) and "patient_id" (for counting)
        group (str): group of interest e.g. "80+". The special value "other" selects
                     everyone whose "community_ageband" is not in groups_not_in_other_group.
        name_of_other_group (str): name to give for the general population who are vaccinated but do not have recorded eligibility factors
        groups_not_in_other_group (list-like): groups to exclude from "other" group, i.e. all currently included criteria for eligibility

    Outputs:
        displays string describing n and % of given group with ethnicity known
        (0 and 0.0% when the group contains no patients)
    '''
    out = df[["community_ageband", "ethnicity_6_groups", "patient_id"]]

    if group == "other":
        # group without denominator: everyone outside the listed eligibility groups
        out = out.loc[~out["community_ageband"].isin(list(groups_not_in_other_group))]
        group = name_of_other_group
    else:
        # in subgroups with denominators
        out = out.loc[out["community_ageband"] == group]

    total = out["patient_id"].nunique()

    # unique patients per ethnicity category, then everything except "Unknown"
    per_ethnicity = out.groupby("ethnicity_6_groups")["patient_id"].nunique()
    known_eth = int(per_ethnicity[per_ethnicity.index != "Unknown"].sum())

    # an empty group has no denominator; report 0.0% rather than nan
    percent = round(100 * (known_eth / total), 1) if total else 0.0

    display(Markdown(f"Total **{group}** population with ethnicity recorded {known_eth:,d} ({percent}%)"))
|
||
|
||
|
||
def care_home_flag_comparison(df):
    '''
    Compare number of patients flagged with each different care home flag

    Inputs:
        df (dataframe): processed patient-level dataframe with columns
                        "care_home_primis" (1 when flagged), "care_home" ("yes" when flagged),
                        "patient_id" and "ageband"

    Outputs:
        display text describing care home population according to each flag:
        patients with only the "care_home" flag (reported as "address flag"),
        only the "care_home_primis" flag (reported as "snomed flag"), or both.
        Only patients in the 60+ age bands listed below are counted.
    '''
    in_age_range = df["ageband"].isin(["65-69", "60-69", "70-79", "80+"])
    any_flag = (df["care_home_primis"] == 1) | (df["care_home"] == "yes")
    flagged = df.loc[in_age_range & any_flag, ["care_home_primis", "care_home", "patient_id"]]

    has_primis = flagged["care_home_primis"] == 1
    has_care_home = flagged["care_home"] == "yes"

    # count each combination explicitly so that a combination with no patients
    # gives 0 instead of shifting positions in a pivoted table
    out_dict = {
        "address flag": flagged.loc[has_care_home & ~has_primis, "patient_id"].nunique(),
        "snomed flag": flagged.loc[has_primis & ~has_care_home, "patient_id"].nunique(),
        "both": flagged.loc[has_primis & has_care_home, "patient_id"].nunique(),
    }
    total = sum(out_dict.values())

    display(Markdown("#### Care home flags"))
    for p, v in out_dict.items():
        # round after scaling to avoid float artefacts such as 56.99999999999999
        percent = round(100 * v / total, 1) if total else 0.0
        display(Markdown(f"Patients with {p} = {v} ({percent}%)"))
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
class DataCleaningError(Exception):
    """Error signalling a problem while loading or cleaning the data.

    The message may be given positionally (``DataCleaningError("text")``) or
    with the ``message`` keyword (``DataCleaningError(message="text")``). A
    bare ``Exception`` subclass rejects keyword arguments, so the keyword
    form needs this explicit constructor.
    """

    def __init__(self, *args, message=None):
        if message is not None:
            args = (message,) + args
        super().__init__(*args)
        # first argument kept as `.message` for convenient access
        self.message = args[0] if args else ""
Oops, something went wrong.