Skip to content

Commit

Permalink
Merge pull request #15 from opensafely/weekly-update-apr-6
Browse files Browse the repository at this point in the history
Weekly update apr 6
  • Loading branch information
brianmackenna authored Apr 7, 2021
2 parents 0213bc1 + 90e9e39 commit 92ee356
Show file tree
Hide file tree
Showing 41 changed files with 35,146 additions and 22,543 deletions.
17 changes: 17 additions & 0 deletions analysis/study_definition_delivery.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,5 +268,22 @@
"incidence": 0.1
},
),

# COVID VACCINATION - Moderna
covid_vacc_moderna_date=patients.with_tpp_vaccination_record(
product_name_matches="COVID-19 mRNA (nucleoside modified) Vaccine Moderna 0.1mg/0.5mL dose dispersion for inj MDV",
on_or_after="2020-12-01", # check all december to date
find_first_match_in_period=True,
returning="date",
date_format="YYYY-MM-DD",
return_expectations={
"date": {
"earliest": "2020-04-01", # expected from early april
"latest": index_date,
},
"incidence": 0.1
},
),

**common_variables
)
2 changes: 1 addition & 1 deletion analysis/study_definition_delivery_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
AND (
covid_vacc_date
OR
(age >=65)
(age >=50)
OR
shielded
OR
Expand Down
5 changes: 2 additions & 3 deletions lib/create_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@


# we create a dict for renaming population variables into suitable longer/correctly capitalised forms for presentation as titles
variable_renaming = {'ageband': "Age band",
'ageband 5yr': "Age band",
'ageband_5yr': "Age band",
variable_renaming = { 'ageband 5yr': "Age band",
'ageband': "Age band",
'sex': "Sex",
'bmi':"BMI",
'ethnicity 6 groups':"Ethnicity (broad categories)",
Expand Down
7 changes: 5 additions & 2 deletions lib/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,13 @@ def load_data(input_file='input_delivery.csv', input_path="output"):
covid_vacc_flag = np.where(df["covid_vacc_date"]!=0,"vaccinated","unvaccinated"),
covid_vacc_flag_ox = np.where(df["covid_vacc_oxford_date"]!=0, 1, 0),
covid_vacc_flag_pfz = np.where(df["covid_vacc_pfizer_date"]!=0, 1, 0),
covid_vacc_flag_mod = np.where(df["covid_vacc_moderna_date"]!=0, 1, 0),
covid_vacc_2nd = np.where(df["covid_vacc_second_dose_date"]!=0, 1, 0),
covid_vacc_bin = np.where(df["covid_vacc_date"]!=0, 1, 0))


# create an additional field for 2nd dose to use as a flag for each eligible group
df["2nd_dose"] = df["covid_vacc_2nd"]

# Reassign the ssri column so that it flags only patients who have an SSRI and no psychosis/bipolar/schizophrenia/dementia or LD
df = df.assign(
ssri = np.where((df["ssri"]==1) & (df["psychosis_schiz_bipolar"]==0) &\
Expand Down Expand Up @@ -105,7 +108,7 @@ def load_data(input_file='input_delivery.csv', input_path="output"):
df = df.rename(columns={"shielded_since_feb_15":"newly_shielded_since_feb_15"})

# for each specific situation or condition, replace 1 with YES and 0 with no. This makes the graphs easier to read
for c in ["LD", "newly_shielded_since_feb_15", "dementia",
for c in ["2nd_dose", "LD", "newly_shielded_since_feb_15", "dementia",
"chronic_cardiac_disease", "current_copd", "dialysis", "dmards","psychosis_schiz_bipolar",
"solid_organ_transplantation", "chemo_or_radio", "intel_dis_incl_downs_syndrome","ssri",
"lung_cancer", "cancer_excl_lung_and_haem", "haematological_cancer", "bone_marrow_transplant",
Expand Down
79 changes: 47 additions & 32 deletions lib/report_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def filtering(d):
return l


def cumulative_sums(df, groups_of_interest, features_dict, latest_date):
def cumulative_sums(df, groups_of_interest, features_dict, latest_date, reference_column_name="covid_vacc_date"):
'''
Calculate cumulative sums across groups
Expand All @@ -108,6 +108,7 @@ def cumulative_sums(df, groups_of_interest, features_dict, latest_date):
groups_of_interest (dict): dict mapping names of population/eligible subgroups to integers (1-9, and 0 for "other")
features_dict (dict): dictionary mapping population subgroups to a list of demographic/clinical factors to include for that group
latest_date (str): "YYYY-MM-DD"
reference_column_name (str): e.g. "covid_vacc_date" for first dose, "covid_vacc_second_dose_date" for second dose
Returns:
df_dict_out (dict): This dict is a mapping from a group name (e.g '80+') to another dict, which is a mapping from a feature name (e.g. 'sex') to a dataframe containing cumulative sums of vaccination data per day.
Expand All @@ -119,7 +120,7 @@ def cumulative_sums(df, groups_of_interest, features_dict, latest_date):
# for each group within the desired groups, it filters to that particular group. it
# also selects columns of interest. For example, in care home, we are interested in
# sex, ageband and broad ethnicity groups. In the analysis of age bands we are interested
# in much more detail such as comorbidies and ethnicity in 16 groups.
# in much more detail such as comorbidities and ethnicity in 16 groups.

# make a new field for the priority groups we are looking at (where any we have not specifically listed are regrouped as 0/"other")
items_to_group = filtering(groups_of_interest)
Expand All @@ -140,15 +141,15 @@ def cumulative_sums(df, groups_of_interest, features_dict, latest_date):
cols = features_dict["DEFAULT"]


df_dict_temp = filtered_cumulative_sum(df=out, columns=cols, latest_date=latest_date)
df_dict_temp = filtered_cumulative_sum(df=out, columns=cols, latest_date=latest_date, reference_column_name=reference_column_name)


df_dict_out[group_title] = df_dict_temp

return df_dict_out


def filtered_cumulative_sum(df, columns, latest_date):
def filtered_cumulative_sum(df, columns, latest_date, reference_column_name="covid_vacc_date"):
"""
This calculates cumulative sums for a dataframe, and when given a set of
characteristics as columns, produces a dictionary of dataframes.
Expand All @@ -158,6 +159,7 @@ def filtered_cumulative_sum(df, columns, latest_date):
YYYY-MM-DD format, a column called 'covid_vacc_date' and a 'covid_vacc_flag'.
columns (list): list of subgroups e.g. ageband, sex
latest_date (datetime object): the date of the latest date of counting vaccines
reference_column_name (str): e.g. "covid_vacc_date" for first dose, "covid_vacc_second_dose_date" for second dose
Returns:
Dict (of dataframes): Each dataframe produced has a date as a row, with the value of the number
Expand All @@ -175,30 +177,32 @@ def filtered_cumulative_sum(df, columns, latest_date):
total = df[["patient_id"]].nunique()[0]

# Copies the dataframe but filters only to those who have had a vaccine recorded
out2 = df.copy().loc[(df["covid_vacc_flag"]=="vaccinated")]
filtered = df.copy().loc[(df[reference_column_name]!=0)]

# group by date of covid vaccines to calculate cumulative sum of vaccines at each date of the campaign
out2 = pd.DataFrame(out2.groupby(["covid_vacc_date"])[["patient_id"]].nunique().unstack().fillna(0).cumsum()).reset_index()
out2 = pd.DataFrame(filtered.groupby([reference_column_name])[["patient_id"]].nunique().unstack().fillna(0).cumsum()).reset_index()
out2 = out2.rename(columns={0:"overall"}).drop(["level_0"],1)

# in case no vaccinations on latest date for some orgs/groups, insert the latest data as a new row with the required date:
if out2["covid_vacc_date"].max()<latest_date:
if out2[reference_column_name].max()<latest_date:
out2.loc[max(out2.index)+1] = [latest_date, out2["overall"].max()]

# suppress low numbers
out2["overall"] = out2["overall"].replace([1,2,3,4,5,6], 0).fillna(0).astype(int)

# Rounds the overall_total values (and makes into integers)
out2["overall_total"] = round7(total)
out2["overall_total"] = round7(total)

# create a percentage by dividing results by total
out2["overall_percent"] = 100*(out2["overall"]/out2["overall_total"])
df_dict_temp["overall"] = out2.set_index("covid_vacc_date")
out2[f"overall_percent"] = 100*(out2["overall"]/out2["overall_total"])

df_dict_temp["overall"] = out2.set_index(reference_column_name)

# figures by demographic/clinical features
for feature in columns:
if feature=="sex":
df = df.loc[df[feature].isin(["M","F"])]
filtered = filtered.loc[filtered[feature].isin(["M","F"])]

# find total number of patients in each subgroup (e.g. no of males and no of females)
totals = df.groupby([feature])[["patient_id"]].nunique().rename(columns={"patient_id":"total"}).transpose()
Expand All @@ -208,8 +212,7 @@ def filtered_cumulative_sum(df, columns, latest_date):

# find total number of patients vaccinated in each subgroup (e.g. no of males and no of females),
# cumulative at each date of the campaign
out2 = df.copy().loc[(df["covid_vacc_flag"]=="vaccinated")]
out2 = out2.groupby([feature, "covid_vacc_date"])["patient_id"].nunique().unstack(0)
out2 = filtered.copy().groupby([feature, reference_column_name])["patient_id"].nunique().unstack(0)
out2 = out2.fillna(0).cumsum()

# suppress low numbers
Expand Down Expand Up @@ -312,6 +315,7 @@ def report_results(df_dict_cum, group, latest_date, breakdown=None):
# for each category in the breakdown
for category in breakdown:
out = df_dict_cum[group][category]
reference_column_name = out.index.name

# calculate changes: select only latest date and 7 days ago:
latest = pd.to_datetime(out.index).max()
Expand Down Expand Up @@ -360,7 +364,7 @@ def report_results(df_dict_cum, group, latest_date, breakdown=None):

##### n, percent and total pop figures for latest date
out2 = df_dict_cum[group][category].reset_index()
out2 = out2.loc[out2["covid_vacc_date"]==latest_date].reset_index().set_index("covid_vacc_date").drop(["index"], 1).transpose()
out2 = out2.loc[out2[reference_column_name]==latest_date].reset_index().set_index(reference_column_name).drop(["index"], 1).transpose()
# split field names e.g. "M_percent" ->"M""percent"
out2.index = pd.MultiIndex.from_tuples(out2.index.str.split('_').tolist())
out2 = out2.unstack().reset_index(col_level=1)
Expand Down Expand Up @@ -432,7 +436,7 @@ def round7(input_):
return ( int(7*round((input_/7),0)) )


def create_summary_stats(df, summarised_data_dict, formatted_latest_date, savepath,
def create_summary_stats(df, summarised_data_dict, formatted_latest_date, savepath, vaccine_type="first_dose",
groups=["80+", "70-79", "care home", "shielding (aged 16-69)"],
suffix=""):
"""
Expand All @@ -449,6 +453,8 @@ def create_summary_stats(df, summarised_data_dict, formatted_latest_date, savep
formatted_latest_date (str): str that is created by running
find_and_save_latest_date()
savepath (dict): location to save summary stats
vaccine_type (str): "first_dose" or "second_dose"; selects the date column used for counting and is used in output strings to describe the dose received.
Also appended to filename of output.
groups (list): groups of interest.
suffix (str): provider name to append to output
Expand All @@ -463,11 +469,15 @@ def create_summary_stats(df, summarised_data_dict, formatted_latest_date, savep
summary_stats[f"### As at {formatted_latest_date}"] = ""

# get the total vaccinated and round to the nearest 7
vaccinated_total = round7( df.loc[df["covid_vacc_date"]!=0]["patient_id"].nunique() )
if vaccine_type=="first_dose":
reference_column_name="covid_vacc_date"
elif vaccine_type=="second_dose":
reference_column_name="covid_vacc_second_dose_date"
vaccinated_total = round7( df.loc[df[reference_column_name]!=0]["patient_id"].nunique() )

# add the results to the summary_stats dict
suffix_str = suffix.replace("_","").upper()
summary_stats[f"**Total** population vaccinated in {suffix_str}"] = f"{vaccinated_total:,d}"
summary_stats[f"**Total** population receiving {vaccine_type.replace('_',' ')} in {suffix_str}"] = f"{vaccinated_total:,d}"

# loop through the specified groups and calculate number vaccinated in the groups
# add the results to the dict
Expand All @@ -477,24 +487,29 @@ def create_summary_stats(df, summarised_data_dict, formatted_latest_date, savep
if "not in other eligible groups" not in group:
percent = out.loc[("overall","overall")]["percent"].round(1)
total = out.loc[("overall","overall")]["total"].astype(int)
summary_stats[f"**{group}** population vaccinated"] = f"{vaccinated:,} ({percent}% of {total:,})"
summary_stats[f"**{group}** population receiving {vaccine_type.replace('_',' ')}"] = f"{vaccinated:,} (**{percent}%** of {total:,})"
#out_str = f"**{k}** population vaccinated {vaccinated:,} ({percent}% of {total:,})"
else:
#out_str = f"**{k}** population vaccinated {vaccinated:,}"
summary_stats[f"**{group}** population vaccinated"] = f"{vaccinated:,}"

# count oxford vax as a proportion of total; filter to date of first vax only in case of patients having mixed types
oxford_vaccines = round7(df.copy().loc[df["covid_vacc_date"]==df["covid_vacc_oxford_date"]]["covid_vacc_flag_ox"].sum())
ox_percent = round(100*oxford_vaccines/vaccinated_total, 1)
second_doses = round7(df["covid_vacc_2nd"].sum())
sd_percent = round(100*second_doses/vaccinated_total, 1)

summary_stats[f"#### Vaccine types and second doses"] = ""
summary_stats["Second doses (% of all vaccinated)"] = f"{second_doses:,} ({sd_percent}%)"
summary_stats["Oxford-AZ vaccines (% of all first doses)"] = f"{oxford_vaccines:,} ({ox_percent}%)"
summary_stats[f"**{group}** population receiving {vaccine_type.replace('_',' ')}"] = f"{vaccinated:,}"

# if summarising first doses, perform some additional calculations
if vaccine_type=="first_dose":
# count oxford vax as a proportion of total; filter to date of first vax only in case of patients having mixed types
oxford_vaccines = round7(df.copy().loc[df["covid_vacc_date"]==df["covid_vacc_oxford_date"]]["covid_vacc_flag_ox"].sum())
ox_percent = round(100*oxford_vaccines/vaccinated_total, 1)
moderna_vaccines = round7(df.copy().loc[df["covid_vacc_date"]==df["covid_vacc_moderna_date"]]["covid_vacc_flag_mod"].sum())
mod_percent = round(100*moderna_vaccines/vaccinated_total, 1)
second_doses = round7(df["covid_vacc_2nd"].sum())
sd_percent = round(100*second_doses/vaccinated_total, 1)

summary_stats[f"#### Vaccine types and second doses"] = ""
summary_stats["Second doses (% of all vaccinated)"] = f"{second_doses:,} ({sd_percent}%)"
summary_stats["Oxford-AZ vaccines (% of all first doses)"] = f"{oxford_vaccines:,} ({ox_percent}%)"
summary_stats["Moderna vaccines (% of all first doses)"] = f"{moderna_vaccines:,} ({mod_percent}%)"

# export summary stats to text file
json.dump(summary_stats, open(os.path.join(savepath["text"], "summary_stats.txt"),'w'))
json.dump(summary_stats, open(os.path.join(savepath["text"], f"summary_stats_{vaccine_type}.txt"),'w'))

return summary_stats

Expand Down Expand Up @@ -580,7 +595,7 @@ def plot_dem_charts(summary_stats_results, cumulative_data_dict, formatted_lates
display(Markdown(f"## \n ## COVID vaccination rollout among **{k}** population up to {formatted_latest_date}{org_string}"))

# get the overall vaccination rate among relevant group and strip out the text to get the number (should be within 0 - 100)
overall_rate = float(summary_stats_results[f"**{k}** population vaccinated"].split(" ")[1][1:5])
overall_rate = float(summary_stats_results[f"**{k}** population receiving first dose"].split(" ")[1][3:7])

out=cumulative_data_dict[k]

Expand Down Expand Up @@ -615,7 +630,7 @@ def plot_dem_charts(summary_stats_results, cumulative_data_dict, formatted_lates
# plot trend chart and set chart options
out.plot(legend=True, ds='steps-post')
plt.axhline(overall_rate, color="k", linestyle="--", alpha=0.5)
plt.text(0, overall_rate*1.02, "latest overall national* rate")
plt.text(0, overall_rate*1.02, "latest overall cohort rate")
plt.ylim(top=1.1*max(overall_rate, out.max().max()))
plt.ylabel("Percent vaccinated (cumulative)")
plt.xlabel("Date vaccinated")
Expand Down
Loading

0 comments on commit 92ee356

Please sign in to comment.