Merge pull request #12 from opensafely/weekly-update-mar-23

Weekly update mar 23
opensafely · Mar 24, 2021 · e6acb0c · e6acb0c
2 parents d2dab22 + 4783380
commit e6acb0c
Show file tree

Hide file tree

Showing 38 changed files with 33,837 additions and 21,686 deletions.
diff --git a/lib/create_report.py b/lib/create_report.py
@@ -9,6 +9,7 @@
 # we create a dict for renaming population variables into suitable longer/correctly capitalised forms for presentation as titles
 variable_renaming = {'ageband': "Age band",
                       'ageband 5yr': "Age band",
+                      'ageband_5yr': "Age band",
                       'sex': "Sex",
                       'bmi':"BMI",   
                       'ethnicity 6 groups':"Ethnicity (broad categories)",
@@ -117,8 +118,8 @@ def find_and_sort_filenames(foldername, *,
                         ]
         sort_order = {key: ix for ix, key in enumerate(ordered_dems)}
     elif by_demographics_or_population=="population":
-        ordered_pops = ['80+', '70-79', 'care home', 'shielding (aged 16-69)', '65-69',
-                         'under 65s, not in other eligible groups shown']
+        ordered_pops = ['80+', '70-79', 'care home', 'shielding (aged 16-69)', '65-69', 'LD (aged 16-64)', '60-64', '55-59', '50-54',
+                         'under 60s, not in other eligible groups shown']
         sort_order = {key: ix for ix, key in enumerate(ordered_pops)}
     else:
         display("sort_by_population_or_demographics received an invalid value")
@@ -260,11 +261,11 @@ def show_table(filename, latest_date_fmt, *, org_breakdown=None, show_carehomes=
         display(Markdown(f"- Population includes those known to live in an elderly care home, based upon clinical coding."))
     elif "shielding" in title:
         display(Markdown(f"- Population excludes those over 65 known to live in an elderly care home, based upon clinical coding."))
-    elif ("80+" in title) | ("70-79" in title) | ("65-69" in title):
+    elif ("80+" in title) | ("70-79" in title) | ("65-69" in title): # don't include under 65s here
         display(Markdown(f"- Population excludes those known to live in an elderly care home, based upon clinical coding."))
 
     # display note that 65-69 and LD group excludes shielding subgroup
-    if ("65-69" in title) | ("LD (aged 16-64)" in title):
+    if ("65-69" in title) | ("60-64" in title)  | ("55-59" in title) | ("50-54" in title) | ("LD (aged 16-64)" in title):
         display(Markdown(f"- Population excludes those who are currently shielding."))
 
 

diff --git a/lib/data_processing.py b/lib/data_processing.py
@@ -80,27 +80,30 @@ def load_data(input_file='input_delivery.csv', input_path="output"):
     df = df.assign(bmi = np.where((df["bmi"]=="Not obese"), "under 30", "30+"))
 
     # drop unnecssary columns or columns created for processing 
-    df = df.drop(["imd","ethnicity_16", "ethnicity", "adrenaline_pen", "has_died", "has_follow_up"], 1)
-
-
-
-    # shielding: keep flag only for under 70s
-    df["shielded"] = np.where((df["shielded"]==1) & (df["age"]<70), "shielding (aged 16-69)", "")
+    df = df.drop(["imd","ethnicity_16", "ethnicity", 'ethnicity_6_sus',
+       'ethnicity_16_sus', "adrenaline_pen", "has_died", "has_follow_up"], 1)
+
+    # categorise into priority groups (similar to the national groups but not exactly the same)
+    conditions = [
+        (df["care_home"]==1) & (df["age"]>=65),
+        (df["age"]>=80),
+        (df["age"]>=70),
+        (df["shielded"]==1),
+        (df["age"]>=65),
+        (df["LD"]==1),
+        (df["age"]>=60),
+        (df["age"]>=55),
+        (df["age"]>=50)]
+    choices = [3,1,2,4,5,6,7,8,9]
+    # note the numbers here denote the desired sort order in which we want to look at these groups, not the priority order
 
-    ###### care homes #####
-    # amend community age band to remove any care home flags for under 65s 
-    df.loc[(df["ageband_community"]=="care home") & (df["age"]<60), "ageband_community"] = df["ageband"] # 10 yr age band
-    df.loc[(df["ageband_community"]=="care home") & (df["age"]>=60) & (df["age"]<65), "ageband_community"] = "60-64" # 5 yr age band
-
-    # amend community age band to remove any people shielding from the under 70s groups (they will be reported in shielded group) 
-    df.loc[(df["ageband_community"]!="care home") & (df["age"]<70) & (df["shielded"]=="shielding (aged 16-69)"), "ageband_community"] = "shielding (aged 16-69)"
+    # create field "priority_group" which uses the appropriate value from `choices` according to the `conditions` met by each line of data. If none are met, assigns 0.
+    # E.g. a patient aged 71 who is not in a care home does not meet the first or second condition, but meets the third, so is assigned the third of the `choices`, i.e. `2`.
+    df['priority_group'] = np.select(conditions, choices, default=0)
 
     # rename column for clarity
     df = df.rename(columns={"shielded_since_feb_15":"newly_shielded_since_feb_15"})
 
-    # LD: keep flag only for under 65s and those not shielding
-    df["LD_group"] = np.where((df["LD"]==1) & (df["age"]<65) &(df["shielded"]==""), "LD (aged 16-64)", "")
-
     # for each specific situation or condition, replace 1 with YES and 0 with no. This makes the graphs easier to read
     for c in ["LD", "newly_shielded_since_feb_15", "dementia", 
           "chronic_cardiac_disease", "current_copd", "dialysis", "dmards","psychosis_schiz_bipolar",
@@ -110,15 +113,13 @@ def load_data(input_file='input_delivery.csv', input_path="output"):
           "temporary_immunosuppression", "asplenia"]:
           df[c] = np.where(df[c]==1, "yes", "no")
 
-    # rename columns for agebands for consistency
-    df = df.rename(columns={"ageband_community":"community_ageband"})
 
     # get total population sizes and names for each STP
     stps = pd.read_csv(os.path.join("..","lib","stp_dict.csv"), usecols=["stp_id","name","list_size_o80"])
     df = df.merge(stps, left_on="stp", right_on="stp_id", how="left").rename(columns={"name":"stp_name"})
 
     # drop additional columns
-    df = df.drop(["age","stp_id"], 1)  
+    df = df.drop(['registered', 'care_home', 'age',"stp_id", "ageband_community"], 1)  
 
     return df
 

diff --git a/lib/data_quality.py b/lib/data_quality.py
@@ -7,7 +7,7 @@
 # import custom functions from 'lib' folder
 import sys
 sys.path.append('../lib/')
-from report_results import create_output_dirs, filter_other_group, round7
+from report_results import create_output_dirs, round7
 
 def ethnicity_completeness(df, groups_of_interest):
 
@@ -16,33 +16,24 @@ def ethnicity_completeness(df, groups_of_interest):
     
     Inputs:
     df (dataframe): processed patient-level dataframe containing "ethnicity_6_groups" column,
-                    as well as "community_ageband" (for filtering to given group) and "patient_id" (for counting)
-    group (str): group of interest e.g. "80+"
-    name_of_other_group (str): name to give for the general population who are vaccinated but do not have recorded eligibility factors
-    groups_not_in_other_group (list): groups to exclude from "other" group, i.e. all currently included criteria for eligibility
+                    as well as "group" & "group_name" (to identify vaccine priority group) and "patient_id" (for counting)
+    groups_of_interest (dict): dict mapping names of population/eligible subgroups to integers (1-9, and 0 for "other")
     
     Outputs:
     displays string describing n and % of given group with ethnicity known
     
     '''
     # create copy of df only with cols of interest
-    cols = [v for v in set(groups_of_interest.values()) if v != "other"]
-    cols.extend(["ethnicity_6_groups","patient_id"])   
-
+    cols = ["group", "group_name", "ethnicity_6_groups","patient_id"]   
+
     ethnicity_coverage = pd.DataFrame(columns=["group", "n with ethnicity", "total population (n)", "ethnicity coverage (%)"])
 
-    for i, (group_title, group_label) in enumerate(groups_of_interest.items()):
+
+    for i, (groupname, groupno) in enumerate(groups_of_interest.items()):
         out = df[cols].copy()
         # filter dataframe to eligible group
-        if group_label == "other": # for "all others" filter out the each of the defined groups
-            # we want to exclude all the other eligible groups from the "other" group
-            out = filter_other_group(out, groups_of_interest=groups_of_interest)
-        elif group_label != "community_ageband": # for groups not defined as age bands or care home, filter out care home population
-            out = out.loc[(out["community_ageband"]!="care home") & (out[group_label]==group_title)]
-        # will need a further filter for the "clinically vulnerable" group here
-        else:    # age groups / care home
-            out = out.loc[(out[group_label]==group_title)]
-
+        out = out.loc[(out["group_name"]==groupname)]
+
         total = round7(out["patient_id"].nunique())
 
         known_eth = out.groupby("ethnicity_6_groups")[["patient_id"]].nunique().reset_index()
@@ -52,11 +43,13 @@ def ethnicity_completeness(df, groups_of_interest):
 
         # export ethnicity coverage stats to text file
         savepath, _, _ = create_output_dirs()
-        ethnicity_coverage.loc[i] = [group_title, known_eth, total, percent]
+        if groupno == 0: # for the other group the denominator is unknown and only vaccinated people are included
+            groupname ="vaccinated "+groupname
+        ethnicity_coverage.loc[i] = [groupname, known_eth, total, percent]
         ethnicity_coverage.to_csv(os.path.join(savepath["text"], "ethnicity_coverage.csv"), index=False)
 
 
-        display(Markdown(f"Total **{group_title}** population with ethnicity recorded {known_eth:,d} ({percent}%)"))
+        display(Markdown(f"Total **{groupname}** population with ethnicity recorded {known_eth:,d} ({percent}%)"))
 
 
 

diff --git a/lib/report_results.py b/lib/report_results.py
@@ -80,37 +80,23 @@ def find_and_save_latest_date(df, savepath, reference_column_name="covid_vacc_da
     return latest_date, latest_date_fmt
 
 
-def remove_other_key(d):
-    '''
-    Mutate a dictionary to remove key which has value "other"
+def filtering(d):
     '''
+    Find items from the full set of single digit numbers (0-9) which are not present as values in a given dict
     
-    keys = list(d.keys())
-    d2 = d.copy()
-    for key, val in d.items():
-        if val == "other":
-            del d2[key]
-    return d2
-
-
-def filter_other_group(df, groups_of_interest):
-    '''
-    Exclude all the other eligible groups from the "other" group in dataframe provided. "other" should be contained in the groups_of_interest.
+    Inputs:
+    d (dict): a dict mapping strings to numeric `values`
     
-    Args:
-        df (dataframe): input data
-        groups_of_interest (dict): subgroups to breakdown by (and column on which to filter for these groups)
+    Outputs:
+    l (list): a list containing zero and all the single digit numbers (0-9) which do not appear among the values of d
     
-    Returns:
-        df (dataframe)
     '''
-    groups_to_exclude = groups_of_interest.copy()
-
-    groups_to_exclude = remove_other_key(groups_to_exclude)
-    for group_to_exclude, filter_col in groups_to_exclude.items():
-        df = df.loc[(~df[filter_col].isin([group_to_exclude]))]
-
-    return df
+    all_keys = [0,1,2,3,4,5,6,7,8,9]
+    keys = list(d.values())
+
+    # check which of `all_keys` are absent in `keys` and return them as a list (but always include 0)
+    l = [k for k in all_keys if ((k not in keys)|(k==0))]
+    return l
 
 
 def cumulative_sums(df, groups_of_interest, features_dict, latest_date):
@@ -119,7 +105,7 @@ def cumulative_sums(df, groups_of_interest, features_dict, latest_date):
     
     Args:
         df (dataframe): input data
-        groups_of_interest (dict): subgroups to breakdown by (and column on which to filter for these groups)
+        groups_of_interest (dict): dict mapping names of population/eligible subgroups to integers (1-9, and 0 for "other")
         features_dict (dict): dictionary mapping population subgroups to a list of demographic/clinical factors to include for that group
         latest_date (str): "YYYY-MM-DD"
     
@@ -134,18 +120,16 @@ def cumulative_sums(df, groups_of_interest, features_dict, latest_date):
     # also selects columns of interest. For example, in care home, we are interested in 
     # sex, ageband and broad ethnicity groups. In the analysis of age bands we are interested
     # in much more detail such as comorbidies and ethnicity in 16 groups. 
-    for group_title, group_label in groups_of_interest.items():
 
-        # filter dataframe to eligible group
-        if group_label == "other": # for "all others" filter out the each of the defined groups
-            out = df.copy()
-            # we want to exclude all the other eligible groups from the "other" group
-            out = filter_other_group(out, groups_of_interest=groups_of_interest)
-        elif group_label != "community_ageband": # for groups not defined as age bands or care home, filter out care home population
-            out = df.copy().loc[(df["community_ageband"]!="care home") & (df[group_label]==group_title)]
-        # will need a further filter for the "clinically vulnerable" group here
-        else:    # age groups / care home
-            out = df.copy().loc[(df[group_label]==group_title)]
+    # make a new field for the priority groups we are looking at (where any we have not specifically listed are regrouped as 0/"other")
+    items_to_group = filtering(groups_of_interest)
+    df["group"] = np.where(df["priority_group"].isin(items_to_group), 0, df["priority_group"])
+    # translate number into name
+    for name, number in groups_of_interest.items():
+        df.loc[df["group"]==number, "group_name"] = name
+
+    for group_title, group_label in groups_of_interest.items():
+        out = df.copy().loc[(df["group"]==group_label)]
 
         # define columns to include, ie. a list of features of interest (e.g. ageband, ethnicity) per population group 
         if group_title in features_dict:
@@ -213,7 +197,6 @@ def filtered_cumulative_sum(df, columns, latest_date):
 
     # figures by demographic/clinical features
     for feature in columns:
-
         if feature=="sex":
             df = df.loc[df[feature].isin(["M","F"])]
 
@@ -248,7 +231,7 @@ def filtered_cumulative_sum(df, columns, latest_date):
     return df_dict_temp
 
 
-def make_vaccine_graphs(df, latest_date, savepath, savepath_figure_csvs, name_of_other_group="other", suffix=""):
+def make_vaccine_graphs(df, latest_date, savepath, savepath_figure_csvs, suffix=""):
     '''
     Cumulative chart by day of total vaccines given across key eligible groups
     
@@ -257,21 +240,14 @@ def make_vaccine_graphs(df, latest_date, savepath, savepath_figure_csvs, name_of
         latest_date (str): latest date across dataset in YYYY-MM-DD format
         savepath (dict): path to save figure as svg (savepath["figures"])
         savepath_figure_csvs (str): path to save machine readable csv for recreating the chart
-        name_of_other_group (str): option to rename "other" group as something more descriptive 
+        groups_of_interest (dict): population subgroups 
     '''
 
     dfp = df.copy().loc[(df["covid_vacc_date"]!=0)]
-
-    dfp["group"] = np.where( dfp["community_ageband"].isin(["80+","70-79","care home", "65-69"]), 
-                            dfp["community_ageband"], "other" )
-    # separate shielding and LD groups out from "other" group
-    dfp["group"] = np.where( (dfp["shielded"]=="shielding (aged 16-69)") & (dfp["group"]=="other"), "shielding (aged 16-69)", dfp["group"] )
-    dfp["group"] = np.where( (dfp["LD_group"]=="LD (aged 16-64)") & (dfp["group"]=="other"), "LD (aged 16-64)", dfp["group"] )
-
-    dfp = dfp.groupby(["covid_vacc_date","group"])[["patient_id"]].count()  
+
+    dfp = dfp.groupby(["covid_vacc_date","group_name"])[["patient_id"]].count()  
     dfp = dfp.unstack().fillna(0).cumsum().reset_index().replace([0,1,2,3,4,5,6],0) 
 
-    dfp = dfp.rename(columns={"other":name_of_other_group})
     dfp["covid_vacc_date"] = pd.to_datetime(dfp["covid_vacc_date"]).dt.strftime("%d %b")
     dfp = dfp.set_index("covid_vacc_date")
     dfp = round7(dfp)