From ebb8a44a72ade78268ff11925c181b3b5526fbcf Mon Sep 17 00:00:00 2001 From: andrewscolm <73839417+andrewscolm@users.noreply.github.com> Date: Wed, 6 Dec 2023 15:29:55 +0000 Subject: [PATCH] add ctv3 sus analysis --- analysis/local/appendix.R | 174 ++++----- analysis/local/local_manuscript_plots.R | 129 ++++++- analysis/sus/lib_phenotype_validation_sus.py | 359 +++++++++++++----- .../sus/simple_validation_script_ctv3_sus.py | 142 +++++++ project.yaml | 8 + 5 files changed, 618 insertions(+), 194 deletions(-) create mode 100644 analysis/sus/simple_validation_script_ctv3_sus.py diff --git a/analysis/local/appendix.R b/analysis/local/appendix.R index f170f7c..13fe497 100644 --- a/analysis/local/appendix.R +++ b/analysis/local/appendix.R @@ -73,67 +73,67 @@ SUS_gt %>% gtsave(here::here("output","released","made_locally","patient_counts -# # patient counts 5 group -# SUS5yesno <- read_csv(here::here("output","released","made_locally","local_patient_counts_categories_5_registered.csv")) %>% -# filter(subgroup == "Yes" | subgroup == "No") %>% -# mutate(subgroup =recode(subgroup, Yes = "Present", -# No = "Absent" -# )) %>% -# arrange(group,rev(subgroup)) -# -# SUS5<-read_csv(here::here("output","released","made_locally","local_patient_counts_categories_5_registered.csv")) %>% -# filter(subgroup != "Yes" & subgroup != "No") %>% -# bind_rows(SUS5yesno) %>% -# mutate(group = case_when(group == 'age_band' ~ 'age band', -# group == 'learning_disability' ~ 'learning disability', -# group == "imd" ~ "IMD", -# TRUE ~ group), -# subgroup =recode(subgroup, F = "Female", -# M = "Male" -# ) -# ) %>% -# filter(`Asian 5 SNOMED:2022` !="- (-)") -# -# my_cols <- setNames(c("group","",rep(c("SNOMED 2022","SNOMED 2022 with SUS data"),5)),names(SUS5)) -# -# -# SUS5 <- SUS5 %>% -# gt( groupname_col = "group") %>% -# tab_spanner(label="Asian", columns=c(3,4)) %>% -# tab_spanner(label="Black", columns=c(5,6)) %>% -# tab_spanner(label="Mixed", columns=c(7,8)) %>% -# tab_spanner(label="White", columns=c(9,10)) %>% -# tab_spanner(label="Other", columns=c(11,12)) %>% -# cols_label(!!!my_cols) %>% -# tab_style( -# style = list( -# cell_fill(color = "gray96") -# ), -# locations = cells_body( -# ) -# ) %>% -# tab_style( -# style = list( -# cell_text(weight = "bold") -# ), -# locations = cells_column_labels(everything()) -# ) %>% -# tab_options( -# table.align = "left", -# # row_group.as_column = TRUE option not available on the OS R image -# row_group.as_column = TRUE, -# table.font.size = 8, -# column_labels.border.top.width = px(3), -# column_labels.border.top.color = "transparent", -# table.border.top.color = "transparent", -# heading.align = "left" -# ) %>% -# tab_header( -# title = md("Table 2: Count of patients with a recorded ethnicity in OpenSAFELY TPP by ethnicity group (proportion of registered TPP population) and clinical and demographic subgroups. All counts are rounded to the nearest 5. "), -# ) -# -# SUS5 %>% gtsave(here::here("output","released","made_locally","patient_counts_5_group.html")) -# +# patient counts 5 group +SUS5yesno <- read_csv(here::here("output","released","made_locally","local_patient_counts_categories_5_registered.csv")) %>% + filter(subgroup == "Yes" | subgroup == "No") %>% + mutate(subgroup =recode(subgroup, Yes = "Present", + No = "Absent" + )) %>% + arrange(group,rev(subgroup)) + +SUS5<-read_csv(here::here("output","released","made_locally","local_patient_counts_categories_5_registered.csv")) %>% + filter(subgroup != "Yes" & subgroup != "No") %>% + bind_rows(SUS5yesno) %>% + mutate(group = case_when(group == 'age_band' ~ 'age band', + group == 'learning_disability' ~ 'learning disability', + group == "imd" ~ "IMD", + TRUE ~ group), + subgroup =recode(subgroup, F = "Female", + M = "Male" + ) + ) %>% + filter(`Asian 5 SNOMED:2022` !="- (-)") + +my_cols <- setNames(c("group","",rep(c("SNOMED 2022","SNOMED 2022 with SUS data"),5)),names(SUS5)) + + +SUS5 <- SUS5 %>% + gt( groupname_col = "group") %>% + tab_spanner(label="Asian", columns=c(3,4)) %>% + tab_spanner(label="Black", columns=c(5,6)) %>% + tab_spanner(label="Mixed", columns=c(7,8)) %>% + tab_spanner(label="White", columns=c(9,10)) %>% + tab_spanner(label="Other", columns=c(11,12)) %>% + cols_label(!!!my_cols) %>% + tab_style( + style = list( + cell_fill(color = "gray96") + ), + locations = cells_body( + ) + ) %>% + tab_style( + style = list( + cell_text(weight = "bold") + ), + locations = cells_column_labels(everything()) + ) %>% + tab_options( + table.align = "left", + # row_group.as_column = TRUE option not available on the OS R image + row_group.as_column = TRUE, + table.font.size = 8, + column_labels.border.top.width = px(3), + column_labels.border.top.color = "transparent", + table.border.top.color = "transparent", + heading.align = "left" + ) %>% + tab_header( + title = md("Table 2: Count of patients with a recorded ethnicity in OpenSAFELY TPP by ethnicity group (proportion of registered TPP population) and clinical and demographic subgroups. All counts are rounded to the nearest 5. "), + ) + +SUS5 %>% gtsave(here::here("output","released","made_locally","patient_counts_5_group.html")) + @@ -204,7 +204,7 @@ SUS16 <- SUS16 %>% heading.align = "left" ) %>% tab_header( - title = md("Table 2: Count of patients with a recorded ethnicity in OpenSAFELY TPP by ethnicity group (proportion of registered TPP population) and clinical and demographic subgroups. All counts are rounded to the nearest 5."), + title = md("Table 3: Count of patients with a recorded ethnicity in OpenSAFELY TPP by ethnicity group (proportion of registered TPP population) and clinical and demographic subgroups. All counts are rounded to the nearest 5."), ) SUS16 %>% gtsave(here::here("output","released","made_locally","patient_counts_16_group.html")) @@ -247,7 +247,7 @@ latestcommon <- read_csv(here::here("output","released","made_locally","local_la heading.align = "left" ) %>% tab_header( - title = md("Table 3: Count of patients’ most frequently recorded ethnicity (proportion of latest ethnicity). "), + title = md("Table 4: Count of patients’ most frequently recorded ethnicity (proportion of latest ethnicity). "), ) latestcommon %>% gtsave(here::here("output","released","made_locally","latest_frequent.html")) @@ -270,7 +270,7 @@ listed <- read_csv(here::here("output","released","ethnicity","snomed_ethnicity_ heading.align = "left" ) %>% tab_header( - title = md("Table 8: Count of individual ethnicity code use"), + title = md("Table 9: Count of individual ethnicity code use"), ) listed %>% gtsave(here::here("output","released","made_locally","percodelist.html")) @@ -300,14 +300,14 @@ plot_time<- time %>% ggplot(aes(x=Date,y= n,colour=measure))+ theme(legend.title=element_blank()) + scale_x_date(breaks = breakvec, date_labels = "%Y") + scale_y_continuous(name="Number of records", labels = scales::comma) + - ggtitle("Figure 4: Recording of ethnicity over time for latest and first recorded ethnicity. Unknown dates of recording may be stored as '1900-01-01'") + ggtitle("Figure 3: Recording of ethnicity over time for latest and first recorded ethnicity. Unknown dates of recording may be stored as '1900-01-01'") ggsave( filename = here::here( "output", "released", "made_locally", - "fig_4_plot_time.pdf" + "fig_3_plot_time.pdf" ), plot_time, dpi = 600, @@ -377,7 +377,7 @@ ONS_tab_2021 %>% heading.align = "left" ) %>% tab_header( - title = md("Table 7: Count of patients with a recorded ethnicity in OpenSAFELY TPP [amended to the 2021 ethnicity grouping] (proportion of registered TPP population) and 2021 ONS Census counts (proportion of 2021 ONS Census population). All counts are rounded to the nearest 5. "), + title = md("Table 8: Count of patients with a recorded ethnicity in OpenSAFELY TPP [amended to the 2021 ethnicity grouping] (proportion of registered TPP population) and 2021 ONS Census counts (proportion of 2021 ONS Census population). All counts are rounded to the nearest 5. "), ) %>% gtsave(here::here("output","released","made_locally","ons_table_2021_with_2021_categories.html")) @@ -434,7 +434,7 @@ ONS_tab_2001 %>% heading.align = "left" ) %>% tab_header( - title = md("Table 6: Count of patients with a recorded ethnicity in OpenSAFELY TPP by ethnicity group (proportion of registered TPP population) and 2021 ONS Census counts [amended to 2001 grouping] (proportion of 2021 ONS Census population). All counts are rounded to the nearest 5. "), + title = md("Table 7: Count of patients with a recorded ethnicity in OpenSAFELY TPP by ethnicity group (proportion of registered TPP population) and 2021 ONS Census counts [amended to 2001 grouping] (proportion of 2021 ONS Census population). All counts are rounded to the nearest 5. "), ) %>% gtsave(here::here("output","released","made_locally","ons_table_2021_with_2001_categories.html")) @@ -478,7 +478,7 @@ ethnicity_plot_na_2021 <- ethnicity_na_2021 %>% theme(legend.position="bottom", legend.title=element_blank()) + geom_text(aes(x=Ethnic_Group,y=percentage,label=ifelse(cohort=="2021 Census","",paste0(round(diff,digits =1),"%"))), size=3.4, position =position_dodge(width=0.9), vjust=0.3,hjust = -0.2) + - ggtitle("Figure 3: Barplot showing the proportion of 2021 Census and TPP populations (amended to 2021 grouping) per ethnicity grouped into 5 groups per NUTS-1 region (excluding\nthose without a recorded ethnicity). Annotated with percentage point difference between 2021 Census and TPP populations.") + + ggtitle("Figure 2: Barplot showing the proportion of 2021 Census and TPP populations (amended to 2021 grouping) per ethnicity grouped into 5 groups per NUTS-1 region (excluding\nthose without a recorded ethnicity). Annotated with percentage point difference between 2021 Census and TPP populations.") + theme(plot.title = element_text(size = 16)) @@ -510,7 +510,7 @@ ethnicity_plot_eng_na_2021 <- ethnicity_na_2021 %>% theme(legend.position="bottom", legend.title=element_blank()) + geom_text(aes(x=Ethnic_Group,y=percentage,label=ifelse(cohort=="2021 Census","",paste0(round(diff,digits =1),"%"))), size=3.4, position =position_dodge(width=0.9), vjust=0.3 ,hjust = -0.2) + - ggtitle("Figure 2: Barplot showing the proportion of 2021 Census and TPP populations (amended to 2021 grouping) per ethnicity grouped into 5 groups (excluding those\nwithout a recorded ethnicity). Annotated with percentage point difference between 2021 Census and TPP populations.") + + ggtitle("Figure 1: Barplot showing the proportion of 2021 Census and TPP populations (amended to 2021 grouping) per ethnicity grouped into 5 groups (excluding those\nwithout a recorded ethnicity). Annotated with percentage point difference between 2021 Census and TPP populations.") + theme(plot.title = element_text(size = 10)) @@ -584,8 +584,8 @@ my_cols <- setNames(c("","Asian","Black","Mixed", "White","Other","Unknown"),nam df_sus_new_cross_table <- df_sus_new_cross_perc %>% gt( groupname_col = "") %>% - tab_spanner(label="SNOMED: 2022", columns=c(1)) %>% - tab_spanner(label="SUS", columns=c(2:7)) %>% + tab_spanner(label="Primary Care ethnicity", columns=c(1)) %>% + tab_spanner(label="Secondary Care ethnicity", columns=c(2:7)) %>% cols_label(!!!my_cols) %>% tab_style( style = list( @@ -611,7 +611,7 @@ df_sus_new_cross_table <- df_sus_new_cross_perc %>% heading.align = "left" ) %>% tab_header( - title = md("Table 4: Count of patients with a recorded ethnicity in SUS by ethnicity group (proportion of SNOMED:2022 population). All counts are rounded to the nearest 5. "), + title = md("Table 5: Count of patients with a recorded ethnicity in Secondary Care by ethnicity group (proportion of Primary Care population). All counts are rounded to the nearest 5. "), ) @@ -742,8 +742,8 @@ my_cols <- setNames(c("","Asian","Black","Mixed", "White","Other"),names(df_sus_ df_sus_new_cross_known_table <- df_sus_new_cross_known_perc %>% gt( groupname_col = "") %>% - tab_spanner(label="SNOMED: 2022", columns=c(1)) %>% - tab_spanner(label="SUS", columns=c(2:6)) %>% + tab_spanner(label="Primary Care ethnicity", columns=c(1)) %>% + tab_spanner(label="Secondary Care ethnicity", columns=c(2:6)) %>% cols_label(!!!my_cols) %>% tab_style( style = list( @@ -769,7 +769,7 @@ df_sus_new_cross_known_table <- df_sus_new_cross_known_perc %>% heading.align = "left" ) %>% tab_header( - title = md("Table 5: Count of patients with a recorded ethnicity in SUS by ethnicity group excluding Unknown ethnicites (proportion of SNOMED:2022 population). All counts are rounded to the nearest 5. "), + title = md("Table 6: Count of patients with a recorded ethnicity in Secondary Care by ethnicity group excluding Unknown ethnicites (proportion of Primary Care population). All counts are rounded to the nearest 5. "), ) @@ -1111,14 +1111,14 @@ prop_reg_cat_plot <- prop_reg_cat_pivot %>% ggtitle("Figure 1: Barplot showing proportion of registered TPP population with a recorded ethnicity by clinical and demographic subgroups,\nbased on primary care records (solid bars) and when supplemented with secondary care data (pale bars).") -prop_reg_cat_plot - -ggsave( - filename =here::here("output","released","made_locally", "completeness_cat.pdf" - ), - prop_reg_cat_plot, - dpi = 600, - width = 100, - height = 60, - units = "cm" -) +# prop_reg_cat_plot +# +# ggsave( +# filename =here::here("output","released","made_locally", "completeness_cat.pdf" +# ), +# prop_reg_cat_plot, +# dpi = 600, +# width = 100, +# height = 60, +# units = "cm" +# ) diff --git a/analysis/local/local_manuscript_plots.R b/analysis/local/local_manuscript_plots.R index fe2b47b..4a1e723 100644 --- a/analysis/local/local_manuscript_plots.R +++ b/analysis/local/local_manuscript_plots.R @@ -494,25 +494,34 @@ alluvial <- ggplot( ) + geom_alluvium(aes(fill = ethnicity_new_5)) + geom_stratum(aes(fill = ethnicity_sus_5)) + - geom_text(stat = "stratum", aes(label = after_stat(stratum)), colour = "white") + + # geom_text(stat = "stratum", aes(label = after_stat(stratum)), colour = "white",size = 10) + scale_x_discrete(limits = c("ethnicity_new_5", "ethnicity_sus_5"), expand = c(.05, .05), labels = c("ethnicity_new_5" = "Primary Care ethnicity", "ethnicity_sus_5" = "Secondary Care ethnicity"), position = "top") + scale_fill_manual(values = rev(c("#FFD23B", "#808080", "#FF7C00", "#5323B3", "#5A71F3", "#17D7E6")), na.value = NA) + - theme_minimal() + + #theme_minimal() + ggtitle("") + theme( axis.title.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), - axis.text.x = element_text(size = 15) + axis.text.x = element_text(size = 20) ) + theme( + panel.background = element_rect(fill = "black"), panel.grid.major = element_blank(), panel.grid.minor = element_blank() ) + theme( legend.position = "bottom", legend.title = element_blank() - ) + ) + + geom_label_repel(stat = "stratum", + aes(label = after_stat(stratum), + fill = after_stat(stratum)), + colour = "white", + size = 10, + fontface = 'bold', + direction = "x", + show.legend = F) ggsave( filename = here::here( @@ -527,6 +536,103 @@ ggsave( height = 30, units = "cm" ) + +##### only discrepancy alluvial +df_secondary_new_cross_perc <- df_secondary_new_cross_perc %>% + mutate(ethnicity_new_5_na = case_when(ethnicity_new_5=="Unknown"~ethnicity_sus_5), + ethnicity_new_5_na = fct_relevel( + ethnicity_new_5_na, + "Unknown", "Other", "White", "Mixed", "Black", "Asian" + )) + +alluvial_sus_pick <- ggplot( + as.data.frame(df_secondary_new_cross_perc), + aes(y = `0`, axis1 = ethnicity_new_5, axis2 = ethnicity_sus_5) +) + + geom_alluvium(aes(fill = ethnicity_new_5_na)) + + geom_stratum(aes(fill = ethnicity_sus_5)) + + # geom_text(stat = "stratum", aes(label = after_stat(stratum)), colour = "white",size = 10) + + scale_x_discrete(limits = c("ethnicity_new_5", "ethnicity_sus_5"), expand = c(.05, .05), labels = c("ethnicity_new_5" = "Primary Care ethnicity", "ethnicity_sus_5" = "Secondary Care ethnicity"), position = "top") + + scale_fill_manual(values = c( "#808080", "#FF7C00", "#5323B3", "#5A71F3","#FFD23B", "#17D7E6"), na.value = NA) + + #theme_minimal() + + ggtitle("") + + theme( + axis.title.y = element_blank(), + axis.text.y = element_blank(), + axis.ticks.y = element_blank(), + axis.text.x = element_text(size = 20) + ) + + theme( + panel.background = element_rect(fill = "black"), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank() + ) + + theme( + legend.position = "bottom", + legend.title = element_blank() + ) + +ggsave( + filename = here::here( + "output", + "released", + "made_locally", + glue("secondary_care_alluvial_sus_pick.png") + ), + alluvial_sus_pick, + dpi = 600, + width = 50, + height = 30, + units = "cm" +) + +# alluvial picked up from SUS data +##### only discrepancy alluvial +df_secondary_new_cross_perc <- df_secondary_new_cross_perc %>% + mutate(ethnicity_new_5_na = case_when(ethnicity_new_5=="Unknown" & ethnicity_sus_5 !="Unknown" & ethnicity_new_5!=ethnicity_sus_5~ethnicity_new_5)) + +alluvial_disc <- ggplot( + as.data.frame(df_secondary_new_cross_perc), + aes(y = `0`, axis1 = ethnicity_new_5, axis2 = ethnicity_sus_5) +) + + geom_alluvium(aes(fill = ethnicity_new_5_na)) + + geom_stratum(aes(fill = ethnicity_sus_5)) + + # geom_text(stat = "stratum", aes(label = after_stat(stratum)), colour = "white",size = 10) + + scale_x_discrete(limits = c("ethnicity_new_5", "ethnicity_sus_5"), expand = c(.05, .05), labels = c("ethnicity_new_5" = "Primary Care ethnicity", "ethnicity_sus_5" = "Secondary Care ethnicity"), position = "top") + + scale_fill_manual(values = rev(c("#FFD23B", "#808080", "#FF7C00", "#5323B3", "#5A71F3", "#17D7E6")), na.value = NA) + + #theme_minimal() + + ggtitle("") + + theme( + axis.title.y = element_blank(), + axis.text.y = element_blank(), + axis.ticks.y = element_blank(), + axis.text.x = element_text(size = 20) + ) + + theme( + panel.background = element_rect(fill = "black"), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank() + ) + + theme( + legend.position = "bottom", + legend.title = element_blank() + ) + +ggsave( + filename = here::here( + "output", + "released", + "made_locally", + glue("secondary_care_alluvial_disc.png") + ), + alluvial_disc, + dpi = 600, + width = 50, + height = 30, + units = "cm" +) + + secondary_heat_perc_all_patients <- ggplot(df_secondary_new_cross_perc, aes(ethnicity_sus_5, ethnicity_new_5, fill = percentage)) + geom_tile() + @@ -673,16 +779,15 @@ alluvial_frequent <- ggplot( ) + geom_alluvium(aes(fill = latest)) + geom_stratum(aes(fill = common)) + - geom_text(stat = "stratum", aes(label = after_stat(stratum)), colour = "white") + scale_x_discrete(limits = c("latest", "common"), expand = c(.05, .05), labels = c("latest" = "Latest recorded ethnicity", "common" = "Most frequently recorded ethnicity"), position = "top") + scale_fill_manual(values = rev(c("#FFD23B", "#FF7C00", "#5323B3", "#5A71F3", "#17D7E6")), na.value = NA) + - theme_minimal() + + # theme_minimal() + ggtitle("") + theme( axis.title.y = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), - axis.text.x = element_text(size = 15) + axis.text.x = element_text(size = 20) ) + theme( panel.grid.major = element_blank(), @@ -691,7 +796,15 @@ alluvial_frequent <- ggplot( theme( legend.position = "bottom", legend.title = element_blank() - ) + ) + + geom_label_repel(stat = "stratum", + aes(label = after_stat(stratum), + fill = after_stat(stratum)), + colour = "white", + size = 10, + fontface = 'bold', + direction = "x", + show.legend = F) ggsave( filename = here::here( diff --git a/analysis/sus/lib_phenotype_validation_sus.py b/analysis/sus/lib_phenotype_validation_sus.py index be9c87e..ab9e975 100644 --- a/analysis/sus/lib_phenotype_validation_sus.py +++ b/analysis/sus/lib_phenotype_validation_sus.py @@ -39,7 +39,7 @@ def import_clean( ): # Import df_import = pd.read_feather(input_path) - + # Check whether output paths exist or not, create if missing path_tables = f"output/{output_path}/{grouping}/tables/" path_figures = f"output/{output_path}/{grouping}/figures/" @@ -142,9 +142,7 @@ def import_clean( li_col_filled = [col for col in df_clean.columns if col.endswith("_filled")] li_col_missing = [col for col in df_clean.columns if col.endswith("_missing")] - df_clean["any_filled"] = ( - df_clean[li_col_filled].sum(axis=1) > 0 - ).astype(int) + df_clean["any_filled"] = (df_clean[li_col_filled].sum(axis=1) > 0).astype(int) df_clean["all_missing"] = ( df_clean[li_col_missing].sum(axis=1) == len(definitions) @@ -158,25 +156,40 @@ def import_clean( def simple_latest_common_comparison( - df_clean, definitions,reg, other_vars, output_path,grouping,code_dict="", missing_check=False + df_clean, + definitions, + reg, + other_vars, + output_path, + grouping, + code_dict="", + missing_check=False, ): for definition in definitions: if missing_check: df_clean = df_clean[df_clean[f"{definition}_date"] == "1900-01-01"] vars = [s for s in other_vars if s.startswith(definition)] df_subset = df_clean.loc[~df_clean[definition].isna()] - df_subset = df_subset[df_subset[definition].isin(code_dict[definition].values())] + df_subset = df_subset[ + df_subset[definition].isin(code_dict[definition].values()) + ] df_subset = df_subset[[definition] + vars].set_index(definition) df_subset = df_subset.replace(0, np.nan) # reorder columns - col_arrange = [f"{definition}_asian",f"{definition}_black",f"{definition}_mixed",f"{definition}_white",f"{definition}_other"] - df_subset=df_subset[col_arrange] + col_arrange = [ + f"{definition}_asian", + f"{definition}_black", + f"{definition}_mixed", + f"{definition}_white", + f"{definition}_other", + ] + df_subset = df_subset[col_arrange] # find column with first instance of the maximum value - df_subset['max'] = df_subset.astype(float).idxmax(axis=1) + df_subset["max"] = df_subset.astype(float).idxmax(axis=1) df_subset2 = df_subset # returning 1 for the first column that contains the highest value and 0 for everything else for col in df_subset2.columns: - df_subset2[col] = np.where(df_subset2['max'] == col, 1, 0) + df_subset2[col] = np.where(df_subset2["max"] == col, 1, 0) # drop max column df_subset2 = df_subset2[col_arrange] df_sum = redact_round_table(df_subset2.groupby(definition).sum()) @@ -189,8 +202,9 @@ def simple_latest_common_comparison( f"output/{output_path}/{grouping}/tables/simple_latest_common_{definition}_{reg}.csv" ) + def simple_state_change( - df_clean, definitions,reg, other_vars, output_path,grouping, missing_check=False + df_clean, definitions, reg, other_vars, output_path, grouping, missing_check=False ): for definition in definitions: if missing_check: @@ -203,13 +217,15 @@ def simple_state_change( .reset_index() ) ### sum all 'vars' which are not null - df_subset[f"{definition}_any"]=df_subset[vars].notnull().sum(axis=1) - ### all px with a latest ethnicity must have at least 1 defined ethnicity + df_subset[f"{definition}_any"] = df_subset[vars].notnull().sum(axis=1) + ### all px with a latest ethnicity must have at least 1 defined ethnicity ### If they only have 1 recorded ethnicity this must equal the latest ethnicity ### replace 1 with NULL and count all with over one recorded ethnicity (i.e. latest plus another ethnicity) - df_subset[f"{definition}_any"] = df_subset[f"{definition}_any"].replace(1, np.nan) + df_subset[f"{definition}_any"] = df_subset[f"{definition}_any"].replace( + 1, np.nan + ) df_subset["n"] = 1 - ### check if any px have latest ethnicity but no recorded ethnicity (this should be impossible!) + ### check if any px have latest ethnicity but no recorded ethnicity (this should be impossible!) # df_any_check=df_subset # df_any_check[f"{definition}_any_check"]=df_any_check[f"{definition}_any"]==0 # df_any_check[f"{definition}_any_check"]=df_any_check[f"{definition}_any_check"].replace(False, np.nan) @@ -260,9 +276,17 @@ def simple_patient_counts( for definition in definitions: df_clean.loc[df_clean[definition] == x, f"{x}_{definition}_filled"] = 1 li_cat_def.append(f"{x}_{definition}") - df_clean[f"{x}_any"]=(df_clean[df_clean.filter(regex=f'{x}').columns].sum(axis=1) > 0).astype(int) + df_clean[f"{x}_any"] = ( + df_clean[df_clean.filter(regex=f"{x}").columns].sum(axis=1) > 0 + ).astype(int) # Assumes definition[1] is sus and definition[0] is the primary codelist - df_clean[f"{x}_supplemented"] = ((df_clean[f"{x}_{definitions[0]}_filled"] == 1 )| (df_clean[definitions[0]].isnull() & (df_clean[f"{x}_{definitions[1]}_filled"]==1 ))).astype(int) + df_clean[f"{x}_supplemented"] = ( + (df_clean[f"{x}_{definitions[0]}_filled"] == 1) + | ( + df_clean[definitions[0]].isnull() + & (df_clean[f"{x}_{definitions[1]}_filled"] == 1) + ) + ).astype(int) definitions = li_cat_def li_cat_any = [x + "_any" for x in li_cat] li_cat_supplemented = [x + "_supplemented" for x in li_cat] @@ -276,16 +300,20 @@ def simple_patient_counts( .set_index("patient_id") ) li_filled.append(df_temp) - if categories == True: + if categories == True: df_temp = ( - df_clean[["patient_id", "all_filled", "all_missing","any_filled"]+ li_cat_any + li_cat_supplemented] + df_clean[ + ["patient_id", "all_filled", "all_missing", "any_filled"] + + li_cat_any + + li_cat_supplemented + ] .drop_duplicates() .dropna() .set_index("patient_id") ) else: df_temp = ( - df_clean[["patient_id", "all_filled", "all_missing","any_filled"]] + df_clean[["patient_id", "all_filled", "all_missing", "any_filled"]] .drop_duplicates() .dropna() .set_index("patient_id") @@ -312,9 +340,13 @@ def simple_patient_counts( .reset_index(drop=True) ) li_filled_group.append(df_temp) - if categories == True: + if categories == True: df_temp = ( - df_clean[["patient_id", "all_filled", "all_missing","any_filled",group] + li_cat_any + li_cat_supplemented] + df_clean[ + ["patient_id", "all_filled", "all_missing", "any_filled", group] + + li_cat_any + + li_cat_supplemented + ] .drop_duplicates() .dropna() .reset_index(drop=True) @@ -322,7 +354,9 @@ def simple_patient_counts( li_filled_group.append(df_temp) else: df_temp = ( - df_clean[["patient_id", "all_filled", "all_missing","any_filled",group]] + df_clean[ + ["patient_id", "all_filled", "all_missing", "any_filled", group] + ] .drop_duplicates() .dropna() .reset_index(drop=True) @@ -359,10 +393,18 @@ def simple_patient_counts( f"output/{output_path}/{grouping}/tables/simple_patient_counts_categories_{grouping}_{reg}.csv" ) else: - df_append.to_csv(f"output/{output_path}/{grouping}/tables/simple_patient_counts_{grouping}_{reg}.csv") + df_append.to_csv( + f"output/{output_path}/{grouping}/tables/simple_patient_counts_{grouping}_{reg}.csv" + ) -def upset(df_clean, output_path, comparator_1, comparator_2,grouping,): +def upset( + df_clean, + output_path, + comparator_1, + comparator_2, + grouping, +): # create csv for output checking upset_output_check = df_clean[[comparator_1, comparator_2]] upset_output_check[comparator_1] = df_clean[comparator_1].fillna("Unknown") @@ -384,10 +426,19 @@ def upset(df_clean, output_path, comparator_1, comparator_2,grouping,): ) upset.plot() - plt.savefig(f"output/{output_path}/{grouping}/figures/upset_{comparator_1}_{comparator_2}.png") + plt.savefig( + f"output/{output_path}/{grouping}/figures/upset_{comparator_1}_{comparator_2}.png" + ) -def upset_cat(df_clean, output_path, comparator_1, comparator_2, other_vars,grouping,): +def upset_cat( + df_clean, + output_path, + comparator_1, + comparator_2, + other_vars, + grouping, +): upset_cat_df = pd.DataFrame(df_clean[comparator_1]) for definition in [comparator_1, comparator_2]: for var in other_vars: @@ -419,142 +470,252 @@ def upset_cat(df_clean, output_path, comparator_1, comparator_2, other_vars,grou f"output/{output_path}/{grouping}/figures/upset_category_{comparator_1}_{comparator_2}.png" ) -def records_over_time(df_clean, definitions, demographic_covariates, clinical_covariates, output_path, filepath,grouping,reg): + +def records_over_time( + df_clean, + definitions, + demographic_covariates, + clinical_covariates, + output_path, + filepath, + grouping, + reg, +): """ Count the number of records over time - + Arguments: df_clean: a dataframe that has been cleaned using import_clean() definitions: a list of derived variables to be evaluated - demographic_covariates: a list of demographic covariates + demographic_covariates: a list of demographic covariates clinical_covariates: a list of clinical covariates output_path: filepath to the output folder filepath: filepath to the output file - + Returns: .csv file (underlying data) .png file (line plot) """ li_df = [] for definition in definitions: - df_grouped = df_clean[[definition+'_date',definition]].groupby( - definition+'_date' - ).count().reset_index().rename(columns={definition+'_date':'date'}).set_index('date') + df_grouped = ( + df_clean[[definition + "_date", definition]] + .groupby(definition + "_date") + .count() + .reset_index() + .rename(columns={definition + "_date": "date"}) + .set_index("date") + ) li_df.append(redact_round_table(df_grouped)) - df_all_time = pd.concat(li_df).stack().reset_index().rename(columns={'level_1':'variable',0:'value'}) - del li_df - + df_all_time = ( + pd.concat(li_df) + .stack() + .reset_index() + .rename(columns={"level_1": "variable", 0: "value"}) + ) + del li_df + fig, ax = plt.subplots(figsize=(12, 8)) fig.autofmt_xdate() - sns.lineplot(x = 'date', y = 'value', hue='variable', data = df_all_time, ax=ax).set_title('New records by month') - ax.legend().set_title('') + sns.lineplot( + x="date", y="value", hue="variable", data=df_all_time, ax=ax + ).set_title("New records by month") + ax.legend().set_title("") if len(df_all_time) > 0: - df_all_time.to_csv(f'output/{output_path}/{grouping}/tables/records_over_time{filepath}_{reg}.csv') - plt.savefig(f'output/{output_path}/{grouping}/figures/records_over_time{filepath}_{reg}.png') + df_all_time.to_csv( + f"output/{output_path}/{grouping}/tables/records_over_time{filepath}_{reg}.csv" + ) + plt.savefig( + f"output/{output_path}/{grouping}/figures/records_over_time{filepath}_{reg}.png" + ) for group in demographic_covariates + clinical_covariates: for definition in definitions: - df_grouped = df_clean[[definition+'_date',definition,group]].groupby( - [definition+'_date',group]).count().reset_index().rename(columns={definition+'_date':'date'}).set_index(['date', group]) - df_time=redact_round_table(df_grouped).reset_index() + df_grouped = ( + df_clean[[definition + "_date", definition, group]] + .groupby([definition + "_date", group]) + .count() + .reset_index() + .rename(columns={definition + "_date": "date"}) + .set_index(["date", group]) + ) + df_time = redact_round_table(df_grouped).reset_index() fig, ax = plt.subplots(figsize=(12, 8)) fig.autofmt_xdate() - sns.lineplot(x = 'date', y = definition, hue=group, data = df_time, ax=ax).set_title(f'{definition} recorded by {group} and month') - ax.legend().set_title('') + sns.lineplot( + x="date", y=definition, hue=group, data=df_time, ax=ax + ).set_title(f"{definition} recorded by {group} and month") + ax.legend().set_title("") if len(df_time) > 0: - df_time.to_csv(f'output/{output_path}/{grouping}/tables/records_over_time_{definition}_{group}{filepath}_{reg}.csv') - plt.savefig(f'output/{output_path}/{grouping}/figures/records_over_time_{definition}_{group}{filepath}_{reg}.png') - -def records_over_time_perc(df_clean, definitions, demographic_covariates, clinical_covariates, output_path, filepath,grouping,reg): + df_time.to_csv( + f"output/{output_path}/{grouping}/tables/records_over_time_{definition}_{group}{filepath}_{reg}.csv" + ) + plt.savefig( + f"output/{output_path}/{grouping}/figures/records_over_time_{definition}_{group}{filepath}_{reg}.png" + ) + + +def records_over_time_perc( + df_clean, + definitions, + demographic_covariates, + clinical_covariates, + output_path, + filepath, + grouping, + reg, +): """ Count the number of records over time as a percentage of all records - + Arguments: df_clean: a dataframe that has been cleaned using import_clean() definitions: a list of derived variables to be evaluated - demographic_covariates: a list of demographic covariates + demographic_covariates: a list of demographic covariates clinical_covariates: a list of clinical covariates output_path: filepath to the output folder filepath: filepath to the output file - + Returns: .csv file (underlying data) .png file (line plot) """ li_df = [] for definition in definitions: - df_grouped = df_clean[[definition+'_date',definition]].groupby( - definition+'_date' - ).count().reset_index().rename(columns={definition+'_date':'date'}).set_index('date') + df_grouped = ( + df_clean[[definition + "_date", definition]] + .groupby(definition + "_date") + .count() + .reset_index() + .rename(columns={definition + "_date": "date"}) + .set_index("date") + ) li_df.append(redact_round_table(df_grouped)) - df_all_time = pd.concat(li_df).stack().reset_index().rename(columns={'level_1':'variable',0:'value'}) - df_all_time['sum']=df_all_time.groupby('variable', sort=False)['value'].transform('sum') - df_all_time['value']=df_all_time['value']/df_all_time['sum'] - del li_df - + df_all_time = ( + pd.concat(li_df) + .stack() + .reset_index() + .rename(columns={"level_1": "variable", 0: "value"}) + ) + df_all_time["sum"] = df_all_time.groupby("variable", sort=False)["value"].transform( + "sum" + ) + df_all_time["value"] = df_all_time["value"] / df_all_time["sum"] + del li_df + fig, ax = plt.subplots(figsize=(12, 8)) fig.autofmt_xdate() - sns.lineplot(x = 'date', y = 'value', hue='variable', data = df_all_time, ax=ax).set_title('New records by month') - ax.legend().set_title('') + sns.lineplot( + x="date", y="value", hue="variable", data=df_all_time, ax=ax + ).set_title("New records by month") + ax.legend().set_title("") if len(df_all_time) > 0: - df_all_time.to_csv(f'output/{output_path}/{grouping}/tables/records_over_time{filepath}_{reg}.csv') - plt.savefig(f'output/{output_path}/{grouping}/figures/records_over_time{filepath}_{reg}.png') + df_all_time.to_csv( + f"output/{output_path}/{grouping}/tables/records_over_time{filepath}_{reg}.csv" + ) + plt.savefig( + f"output/{output_path}/{grouping}/figures/records_over_time{filepath}_{reg}.png" + ) for group in demographic_covariates + clinical_covariates: for definition in definitions: - df_grouped = df_clean[[definition+'_date',definition,group]].groupby( - [definition+'_date',group]).count().reset_index().rename(columns={definition+'_date':'date'}).set_index(['date', group]) - df_time=redact_round_table(df_grouped).reset_index() - df_time['sum']=df_time.groupby(group, sort=False)[definition].transform('sum') - df_time[definition]=df_time[definition]/df_time['sum']*100 + df_grouped = ( + df_clean[[definition + "_date", definition, group]] + .groupby([definition + "_date", group]) + .count() + .reset_index() + .rename(columns={definition + "_date": "date"}) + .set_index(["date", group]) + ) + df_time = redact_round_table(df_grouped).reset_index() + df_time["sum"] = df_time.groupby(group, sort=False)[definition].transform( + "sum" + ) + df_time[definition] = df_time[definition] / df_time["sum"] * 100 fig, ax = plt.subplots(figsize=(12, 8)) fig.autofmt_xdate() - sns.lineplot(x = 'date', y = definition, hue=group, data = df_time, ax=ax).set_title(f'{definition} recorded by {group} and month') - ax.legend().set_title('') + sns.lineplot( + x="date", y=definition, hue=group, data=df_time, ax=ax + ).set_title(f"{definition} recorded by {group} and month") + ax.legend().set_title("") if len(df_time) > 0: - df_time.to_csv(f'output/{output_path}/{grouping}/tables/records_over_time_{definition}_{group}{filepath}_{reg}.csv') - plt.savefig(f'output/{output_path}/{grouping}/figures/records_over_time_{definition}_{group}{filepath}_{reg}.png') - + df_time.to_csv( + f"output/{output_path}/{grouping}/tables/records_over_time_{definition}_{group}{filepath}_{reg}.csv" + ) + plt.savefig( + f"output/{output_path}/{grouping}/figures/records_over_time_{definition}_{group}{filepath}_{reg}.png" + ) + def display_heatmap(df_clean, definitions, output_path): # All with measurement li_filled = [] for definition in definitions: - df_temp = df_clean[['patient_id']].drop_duplicates().set_index('patient_id') - df_temp[definition+'_filled'] = 1 - df_temp = df_clean[['patient_id', definition+'_filled']].drop_duplicates().dropna().set_index('patient_id') + df_temp = df_clean[["patient_id"]].drop_duplicates().set_index("patient_id") + df_temp[definition + "_filled"] = 1 + df_temp = ( + df_clean[["patient_id", definition + "_filled"]] + .drop_duplicates() + .dropna() + .set_index("patient_id") + ) li_filled.append(df_temp) # Prepare data for heatmap input df_temp2 = pd.concat(li_filled, axis=1) # Remove list from memory - del li_filled - df_transform = df_temp2.replace(np.nan,0) + del li_filled + df_transform = df_temp2.replace(np.nan, 0) df_dot = redact_round_table(df_transform.T.dot(df_transform)) - + # Create mask to eliminate duplicates in heatmap mask = np.triu(np.ones_like(df_dot)) np.fill_diagonal(mask[::1], 0) # Draw the heatmap with the mask fig, ax = plt.subplots(figsize=(12, 8)) - sns.heatmap(df_dot, annot=True, mask=mask, fmt='g', cmap="YlGnBu", vmin=0) - #plt.show() - plt.savefig(f'output/{output_path}/figures/heatmap.png') - -def simple_sus_crosstab( - df_clean, output_path,grouping,reg -): - + sns.heatmap(df_dot, annot=True, mask=mask, fmt="g", cmap="YlGnBu", vmin=0) + # plt.show() + plt.savefig(f"output/{output_path}/figures/heatmap.png") + + +def simple_sus_crosstab(df_clean, output_path, grouping, reg): df_clean.ethnicity_new_5 = df_clean.ethnicity_new_5.fillna(" Unknown") df_clean.ethnicity_sus_5 = df_clean.ethnicity_sus_5.fillna(" Unknown") - data_crosstab = pd.crosstab(df_clean.ethnicity_new_5, - df_clean.ethnicity_sus_5, margins = False) - data_crosstab=redact_round_table(data_crosstab) - data_crosstab.to_csv(f"output/{output_path}/{grouping}/tables/simple_sus_crosstab_{reg}.csv") - - df_clean=df_clean[["ethnicity_new_5","ethnicity_sus_5"]] - data_crosstab_long = pd.DataFrame(df_clean.groupby(["ethnicity_new_5","ethnicity_sus_5"]).size()) - data_crosstab_long=redact_round_table(data_crosstab_long) - data_crosstab_long.to_csv(f"output/{output_path}/{grouping}/tables/simple_sus_crosstab_long_{reg}.csv") - \ No newline at end of file + data_crosstab = pd.crosstab( + df_clean.ethnicity_new_5, df_clean.ethnicity_sus_5, margins=False + ) + data_crosstab = redact_round_table(data_crosstab) + data_crosstab.to_csv( + f"output/{output_path}/{grouping}/tables/simple_sus_crosstab_{reg}.csv" + ) + + df_clean = df_clean[["ethnicity_new_5", "ethnicity_sus_5"]] + data_crosstab_long = pd.DataFrame( + df_clean.groupby(["ethnicity_new_5", "ethnicity_sus_5"]).size() + ) + data_crosstab_long = redact_round_table(data_crosstab_long) + data_crosstab_long.to_csv( + f"output/{output_path}/{grouping}/tables/simple_sus_crosstab_long_{reg}.csv" + ) + + +def simple_ctv3_sus_crosstab(df_clean, output_path, grouping, reg): + df_clean.ethnicity_5 = df_clean.ethnicity_5.fillna(" Unknown") + df_clean.ethnicity_sus_5 = df_clean.ethnicity_sus_5.fillna(" Unknown") + data_crosstab = pd.crosstab( + df_clean.ethnicity_5, df_clean.ethnicity_sus_5, margins=False + ) + data_crosstab = redact_round_table(data_crosstab) + data_crosstab.to_csv( + f"output/{output_path}/{grouping}/tables/simple_sus_crosstab_{reg}.csv" + ) + + df_clean = df_clean[["ethnicity_5", "ethnicity_sus_5"]] + data_crosstab_long = pd.DataFrame( + df_clean.groupby(["ethnicity_5", "ethnicity_sus_5"]).size() + ) + data_crosstab_long = redact_round_table(data_crosstab_long) + data_crosstab_long.to_csv( + f"output/{output_path}/{grouping}/tables/simple_ctv3_sus_crosstab_long_{reg}.csv" + ) diff --git a/analysis/sus/simple_validation_script_ctv3_sus.py b/analysis/sus/simple_validation_script_ctv3_sus.py new file mode 100644 index 0000000..ec911e8 --- /dev/null +++ b/analysis/sus/simple_validation_script_ctv3_sus.py @@ -0,0 +1,142 @@ +from lib_phenotype_validation_sus import * + +############################ CONFIGURE OPTIONS HERE ################################ + +# Import file +input_path = "output/extract_5/input_5.feather" + +# Definitions +definitions = [ + "ethnicity_5", + "ethnicity_sus_5", +] + +# Code dictionary +code_dict = { + "imd": { + 0: "Unknown", + 1: "1 Most deprived", + 2: "2", + 3: "3", + 4: "4", + 5: "5 Least deprived", + }, + "ethnicity_new_5": {1: "White", 2: "Mixed", 3: "Asian", 4: "Black", 5: "Other"}, + "ethnicity_sus_5": {1: "White", 2: "Mixed", 3: "Asian", 4: "Black", 5: "Other"}, +} + +# Other variables to include +other_vars = ["white", "mixed", "asian", "black", "other"] +other_vars_combined = [x + "_" + y for x in definitions for y in other_vars] + +# Restrict to registered as of index date +registered = True +reg = "fullset" +if registered == True: + reg = "registered" + +# Dates +dates = False +date_min = "" +date_max = "" +time_delta = "" + +# Min/max range +min_range = 4 +max_range = 200 + +# Null value – could be multiple values in a list [0,'0',NA] +null = [0, "0"] + +# Covariates +demographic_covariates = ["age_band", "sex", "region", "imd"] +clinical_covariates = ["dementia", "diabetes", "hypertension", "learning_disability"] + +# Output filepath +output_path = "sus/simplified_output" +grouping = "5_group" + +########################## SPECIFY ANALYSES TO RUN HERE ############################## + + +def main(): + # combine defintions and other_vars + df_clean = import_clean( + input_path, + definitions, + other_vars_combined, + demographic_covariates, + clinical_covariates, + reg, + null, + date_min, + date_max, + time_delta, + output_path, + grouping, + code_dict, + dates=False, + registered=registered, + dates_check=False, + ) + # Count patients with records + simple_patient_counts( + df_clean, + definitions, + reg, + demographic_covariates, + clinical_covariates, + output_path, + grouping, + ) + + # Count patients by categories + simple_patient_counts( + df_clean, + definitions, + reg, + demographic_covariates, + clinical_covariates, + output_path, + grouping, + categories=True, + ) + + simple_sus_crosstab(df_clean, output_path, grouping, reg) + + # Latest v most common + simple_latest_common_comparison( + df_clean, + definitions, + reg, + other_vars_combined, + output_path, + grouping, + code_dict, + ) + + # State change + simple_state_change( + df_clean, + definitions, + reg, + other_vars_combined, + output_path, + grouping, + ) + # # records over time + # records_over_time_perc( + # df_clean, definitions, demographic_covariates, clinical_covariates, output_path, "",grouping,reg + # ) + + +########################## DO NOT EDIT – RUNS SCRIPT ############################## + +if __name__ == "__main__": + main() + +# registered = False +# reg = "fullset" + +# if __name__ == "__main__": +# main() diff --git a/project.yaml b/project.yaml index ceb3801..2f1e286 100644 --- a/project.yaml +++ b/project.yaml @@ -90,6 +90,14 @@ actions: tables: output/sus/simplified_output/5_group/tables/*.csv # figures: output/sus/simplified_output/5_group/figures/*.png + execute_simple_validation_analyses_ctv3_sus: + run: python:latest python analysis/sus/simple_validation_script_sus.py + needs: [generate_study_population_5] + outputs: + moderately_sensitive: + tables: output/sus/simplified_output/5_group/tables/*.csv +# figures: output/sus/simplified_output/5_group/figures/*.png + execute_simple_validation_analyses_16_sus: run: python:latest python analysis/sus/simple_validation_script_16_sus.py needs: [generate_study_population_16]