Improve date handling

opensafely · Feb 7, 2023 · 4d7793e · 4d7793e
1 parent 52c56af
commit 4d7793e
Show file tree

Hide file tree

Showing 18 changed files with 634 additions and 192 deletions.
diff --git a/analysis/codelists.py b/analysis/codelists.py
@@ -746,4 +746,18 @@
     anxiety_icd10,
     ocd_icd10,
     ptsd_icd10
+)
+
+# COCP
+cocp_dmd = codelist_from_csv(
+    "codelists/user-elsie_horne-cocp_dmd.csv",
+    system="snomed",
+    column="dmd_id",
+)
+
+# HRT
+hrt_dmd = codelist_from_csv(
+    "codelists/user-elsie_horne-hrt_dmd.csv",
+    system="snomed",
+    column="dmd_id",
 )
diff --git a/analysis/common_variables.py b/analysis/common_variables.py
diff --git a/analysis/create_project_actions.R b/analysis/create_project_actions.R
@@ -237,6 +237,17 @@ actions_list <- splice(
     )
   ),
 
+  comment("Implement study_definition for unvax_extf"),
+
+  action(
+    name = "generate_study_population_unvax_extf",
+    run = "cohortextractor:latest generate_cohort --study-definition study_definition_unvax_extf --output-format csv.gz",
+    needs = list("vax_eligibility_inputs","generate_index_dates"),
+    highly_sensitive = list(
+      cohort = glue("output/input_unvax_extf.csv.gz")
+    )
+  ),
+
   comment("Preprocess data - prevax"),
 
   action(
@@ -301,12 +312,28 @@ actions_list <- splice(
     )
   ),
 
+  comment("Preprocess data - unvax_extf"),
+
+  action(
+    name = "preprocess_data_unvax_extf",
+    run = "r:latest analysis/preprocess_data.R unvax_extf",
+    needs = list("generate_index_dates", "generate_study_population_unvax_extf"),
+    moderately_sensitive = list(
+      describe = glue("output/describe_input_unvax_extf_stage0.txt"),
+      describe_venn = glue("output/describe_venn_unvax_extf.txt")
+    ),
+    highly_sensitive = list(
+      cohort = glue("output/input_unvax_extf.rds"),
+      venn = glue("output/venn_unvax_extf.rds")
+    )
+  ),
+
   comment("Data cleaning - all cohorts"),
 
   action(
     name = "stage1_data_cleaning_all",
     run = "r:latest analysis/stage1_data_cleaning.R all",
-    needs = list("preprocess_data_prevax","preprocess_data_prevax_extf","preprocess_data_vax", "preprocess_data_unvax","vax_eligibility_inputs"),
+    needs = list("preprocess_data_prevax","preprocess_data_prevax_extf","preprocess_data_vax", "preprocess_data_unvax", "preprocess_data_unvax_extf"),
     moderately_sensitive = list(
       refactoring = glue("output/meta_data_factors_*.csv"),
       QA_rules = glue("output/QA_summary_*.csv"),
@@ -318,41 +345,41 @@ actions_list <- splice(
     )
   ),
 
-  action(
-    name = glue("describe_file-input_prevax_stage1"),
-    run = glue("r:latest analysis/describe_file.R input_prevax_stage1 rds"),
-    needs = list("stage1_data_cleaning_all"),
-    moderately_sensitive = list(
-      describe_model_input = glue("output/describe-input_prevax_stage1.txt")
-    )
-  ),
-
-  action(
-    name = glue("describe_file-input_prevax_extf_stage1"),
-    run = glue("r:latest analysis/describe_file.R input_prevax_extf_stage1 rds"),
-    needs = list("stage1_data_cleaning_all"),
-    moderately_sensitive = list(
-      describe_model_input = glue("output/describe-input_prevax_extf_stage1.txt")
-    )
-  ),
-
-  action(
-    name = glue("describe_file-input_vax_stage1"),
-    run = glue("r:latest analysis/describe_file.R input_vax_stage1 rds"),
-    needs = list("stage1_data_cleaning_all"),
-    moderately_sensitive = list(
-      describe_model_input = glue("output/describe-input_vax_stage1.txt")
-    )
-  ),
-
-  action(
-    name = glue("describe_file-input_unvax_stage1"),
-    run = glue("r:latest analysis/describe_file.R input_unvax_stage1 rds"),
-    needs = list("stage1_data_cleaning_all"),
-    moderately_sensitive = list(
-      describe_model_input = glue("output/describe-input_unvax_stage1.txt")
-    )
-  ),
+  # action(
+  #   name = glue("describe_file-input_prevax_stage1"),
+  #   run = glue("r:latest analysis/describe_file.R input_prevax_stage1 rds"),
+  #   needs = list("stage1_data_cleaning_all"),
+  #   moderately_sensitive = list(
+  #     describe_model_input = glue("output/describe-input_prevax_stage1.txt")
+  #   )
+  # ),
+  # 
+  # action(
+  #   name = glue("describe_file-input_prevax_extf_stage1"),
+  #   run = glue("r:latest analysis/describe_file.R input_prevax_extf_stage1 rds"),
+  #   needs = list("stage1_data_cleaning_all"),
+  #   moderately_sensitive = list(
+  #     describe_model_input = glue("output/describe-input_prevax_extf_stage1.txt")
+  #   )
+  # ),
+  # 
+  # action(
+  #   name = glue("describe_file-input_vax_stage1"),
+  #   run = glue("r:latest analysis/describe_file.R input_vax_stage1 rds"),
+  #   needs = list("stage1_data_cleaning_all"),
+  #   moderately_sensitive = list(
+  #     describe_model_input = glue("output/describe-input_vax_stage1.txt")
+  #   )
+  # ),
+  # 
+  # action(
+  #   name = glue("describe_file-input_unvax_stage1"),
+  #   run = glue("r:latest analysis/describe_file.R input_unvax_stage1 rds"),
+  #   needs = list("stage1_data_cleaning_all"),
+  #   moderately_sensitive = list(
+  #     describe_model_input = glue("output/describe-input_unvax_stage1.txt")
+  #   )
+  # ),
 
   # comment("Stage 2 - Missing - Table 1 - all cohorts"),
   # 

diff --git a/analysis/make_model_input.R b/analysis/make_model_input.R
@@ -55,7 +55,8 @@ for (i in 1:nrow(active_analyses)) {
 
   input <- input[,unique(c("patient_id",
                            "index_date",
-                           "end_date",
+                           "end_date_exposure",
+                           "end_date_outcome",
                            active_analyses$exposure[i], 
                            active_analyses$outcome[i],
                            unlist(strsplit(active_analyses$strata[i], split = ";")),
@@ -75,60 +76,17 @@ for (i in 1:nrow(active_analyses)) {
                          "exp_date" = active_analyses$exposure[i])
 
   input <- input %>% 
-    dplyr::mutate(out_date = replace(out_date, which(out_date>end_date | out_date<index_date), NA),
-                  exp_date =  replace(exp_date, which(exp_date>end_date | exp_date<index_date), NA),
+    dplyr::mutate(out_date = replace(out_date, which(out_date>end_date_outcome | out_date<index_date), NA),
+                  exp_date =  replace(exp_date, which(exp_date>end_date_exposure | exp_date<index_date), NA),
                   sub_cat_covid19_hospital = replace(sub_cat_covid19_hospital, which(is.na(exp_date)),"no_infection"))
 
   # Update end date to be outcome date where applicable ------------------------
   print('Update end date to be outcome date where applicable')
 
   input <- input %>% 
     dplyr::rowwise() %>% 
-    dplyr::mutate(end_date = min(end_date, out_date, na.rm = TRUE))
-
-  # # Make three level history covariates ----------------------------------------
-  # print('Make three level history covariates')
-  # 
-  # input$cov_cat_priorhistory_depression <- dplyr::case_when(
-  #   input$cov_bin_history_depression==TRUE & input$cov_bin_recent_depression==TRUE ~ "recent",
-  #   input$cov_bin_history_depression==TRUE & input$cov_bin_recent_depression==FALSE ~ "notrecent",
-  #   input$cov_bin_history_depression==FALSE & input$cov_bin_recent_depression==TRUE ~ "recent",
-  #   input$cov_bin_history_depression==FALSE & input$cov_bin_recent_depression==FALSE ~ "none")
-  # input[,c("cov_bin_history_depression","cov_bin_recent_depression")] <- NULL
-  # input$cov_cat_priorhistory_depression <- as.factor(input$cov_cat_priorhistory_depression)
-  # 
-  # input$cov_cat_priorhistory_anxiety_general <- dplyr::case_when(
-  #       input$cov_bin_history_anxiety==TRUE & input$cov_bin_recent_anxiety==TRUE ~ "recent",
-  #       input$cov_bin_history_anxiety==TRUE & input$cov_bin_recent_anxiety==FALSE ~ "notrecent",
-  #       input$cov_bin_history_anxiety==FALSE & input$cov_bin_recent_anxiety==TRUE ~ "recent",
-  #       input$cov_bin_history_anxiety==FALSE & input$cov_bin_recent_anxiety==FALSE ~ "none")
-  # input[,c("cov_bin_history_anxiety","cov_bin_recent_anxiety")] <- NULL
-  # input$cov_cat_priorhistory_anxiety_general <- as.factor(input$cov_cat_priorhistory_anxiety_general)
-  # 
-  # input$cov_cat_priorhistory_eating_disorders <- dplyr::case_when(
-  #       input$cov_bin_history_eating_disorders==TRUE & input$cov_bin_recent_eating_disorders==TRUE ~ "recent",
-  #       input$cov_bin_history_eating_disorders==TRUE & input$cov_bin_recent_eating_disorders==FALSE ~ "notrecent",
-  #       input$cov_bin_history_eating_disorders==FALSE & input$cov_bin_recent_eating_disorders==TRUE ~ "recent",
-  #       input$cov_bin_history_eating_disorders==FALSE & input$cov_bin_recent_eating_disorders==FALSE ~ "none")
-  # input[,c("cov_bin_history_eating_disorders","cov_bin_recent_eating_disorders")] <- NULL
-  # input$cov_cat_priorhistory_eating_disorders <- as.factor(input$cov_cat_priorhistory_eating_disorders)
-  # 
-  # input$cov_cat_priorhistory_serious_mental_illness <- dplyr::case_when(
-  #       input$cov_bin_history_serious_mental_illness==TRUE & input$cov_bin_recent_serious_mental_illness==TRUE ~ "recent",
-  #       input$cov_bin_history_serious_mental_illness==TRUE & input$cov_bin_recent_serious_mental_illness==FALSE ~ "notrecent",
-  #       input$cov_bin_history_serious_mental_illness==FALSE & input$cov_bin_recent_serious_mental_illness==TRUE ~ "recent",
-  #       input$cov_bin_history_serious_mental_illness==FALSE & input$cov_bin_recent_serious_mental_illness==FALSE ~ "none")
-  # input[,c("cov_bin_history_serious_mental_illness","cov_bin_recent_serious_mental_illness")] <- NULL
-  # input$cov_cat_priorhistory_serious_mental_illness <- as.factor(input$cov_cat_priorhistory_serious_mental_illness)
-  # 
-  # input$cov_cat_priorhistory_self_harm <- dplyr::case_when(
-  #       input$cov_bin_history_self_harm==TRUE & input$cov_bin_recent_self_harm==TRUE ~ "recent",
-  #       input$cov_bin_history_self_harm==TRUE & input$cov_bin_recent_self_harm==FALSE ~ "notrecent",
-  #       input$cov_bin_history_self_harm==FALSE & input$cov_bin_recent_self_harm==TRUE ~ "recent",
-  #       input$cov_bin_history_self_harm==FALSE & input$cov_bin_recent_self_harm==FALSE ~ "none")
-  # input[,c("cov_bin_history_self_harm","cov_bin_recent_self_harm")] <- NULL
-  # input$cov_cat_priorhistory_self_harm <- as.factor(input$cov_cat_priorhistory_self_harm)
-
+    dplyr::mutate(end_date_outcome = min(end_date_outcome, out_date, na.rm = TRUE))
+
   # Make model input: main -------------------------------------------------------
 
   if (active_analyses$analysis[i]=="main") {
@@ -155,11 +113,11 @@ for (i in 1:nrow(active_analyses)) {
     df <- input[input$sub_bin_covid19_confirmed_history==FALSE,]
 
     df <- df %>% 
-      dplyr::mutate(end_date = replace(end_date, which(sub_cat_covid19_hospital=="non_hospitalised"), exp_date-1),
+      dplyr::mutate(end_date_outcome = replace(end_date_outcome, which(sub_cat_covid19_hospital=="non_hospitalised"), exp_date-1),
                     exp_date = replace(exp_date, which(sub_cat_covid19_hospital=="non_hospitalised"), NA),
-                    out_date = replace(out_date, which(out_date>end_date), NA))
+                    out_date = replace(out_date, which(out_date>end_date_outcome), NA))
 
-    df <- df[df$end_date>=df$index_date,]
+    df <- df[df$end_date_outcome>=df$index_date,]
 
     df[,colnames(df)[grepl("sub_",colnames(df))]] <- NULL
 
@@ -179,11 +137,11 @@ for (i in 1:nrow(active_analyses)) {
     df <- input[input$sub_bin_covid19_confirmed_history==FALSE,]
 
     df <- df %>% 
-      dplyr::mutate(end_date = replace(end_date, which(sub_cat_covid19_hospital=="hospitalised"), exp_date-1),
+      dplyr::mutate(end_date_outcome = replace(end_date_outcome, which(sub_cat_covid19_hospital=="hospitalised"), exp_date-1),
                     exp_date = replace(exp_date, which(sub_cat_covid19_hospital=="hospitalised"), NA),
-                    out_date = replace(out_date, which(out_date>end_date), NA))
+                    out_date = replace(out_date, which(out_date>end_date_outcome), NA))
 
-    df <- df[df$end_date>=df$index_date,]
+    df <- df[df$end_date_outcome>=df$index_date,]
     df$index_date <- as.Date(df$index_date)
 
     df[,colnames(df)[grepl("sub_",colnames(df))]] <- NULL

diff --git a/analysis/prelim.R b/analysis/prelim.R
@@ -20,15 +20,17 @@ prelim_data <- prelim_data %>%
                   unit = "days"))) %>%
   mutate(vax_date_covid_2_offset = vax_date_covid_2 + days(efficacy_offset),
          vax_date_eligible_offset = vax_date_eligible + days(eligibility_offset),
-         index_prevax = as.Date(study_dates$pandemic_start)) %>% 
+         index_prevax = as.Date(study_dates$pandemic_start),
+         index_prevax_extf = as.Date(study_dates$pandemic_start)) %>% 
   rowwise() %>%             
   mutate(index_vax = max(c(vax_date_covid_2_offset, delta_date), na.rm=T),
          index_unvax =  max(c(vax_date_eligible_offset, delta_date), na.rm=T),
+         index_unvax_extf =  max(c(vax_date_eligible_offset, delta_date), na.rm=T),
          end_vax = min(c(death_date, delta_end_date), na.rm=T),
-         end_unvax = min(c(death_date, delta_end_date), na.rm=T),
-         end_prevax = min(c(vax_date_eligible,death_date, vax_date_covid_1, all_eligible_date), na.rm=T),
-         end_prevax_exf = min(c(death_date, vax_date_covid_1, na.rm=T))) 
-
+         end_unvax = min(c(death_date, delta_end_date, vax_date_covid_1), na.rm=T),
+         end_unvax_extf = min(c(death_date, delta_end_date), na.rm=T),
+         end_prevax = min(c(vax_date_eligible, death_date, vax_date_covid_1, all_eligible_date), na.rm=T),
+         end_prevax_extf = min(c(death_date, delta_end_date), na.rm=T)) 
 
 #Write data to csv file 
 write_csv(prelim_data, "output/index_dates.csv.gz")
diff --git a/analysis/preprocess_data.R b/analysis/preprocess_data.R
@@ -45,7 +45,7 @@ df <- df %>%
 
 if(Sys.getenv("OPENSAFELY_BACKEND") %in% c("", "expectations") &&
    cohort_name %in% c("vax")) {
-  source("analysis/preprocess/modify_dummy_vax_data.R")
+  source("analysis/modify_dummy_vax_data.R")
   message("Vaccine information overwritten successfully")
 }
 

diff --git a/analysis/stage1_data_cleaning.R b/analysis/stage1_data_cleaning.R
@@ -15,36 +15,34 @@ library(arrow)
 args <- commandArgs(trailingOnly=TRUE)
 
 if(length(args)==0){
-  cohort_name <- "prevax"
+  cohort_name <- "unvax"
 } else {
   cohort_name <- args[[1]]
 }
 
-# Load json file containing vax study dates ------------------------------------
-
-study_dates <- fromJSON("output/study_dates.json")
-
-# Specify relevant dates -------------------------------------------------------
-
-vax_start_date <- as.Date(study_dates$vax1_earliest, format="%Y-%m-%d")
-mixed_vax_threshold <- as.Date("2021-05-07")
-start_date_delta <- as.Date(study_dates$delta_date, format="%Y-%m-%d")
-end_date_delta <- as.Date(study_dates$omicron_date, format="%Y-%m-%d") 
-
 # Define stage 1 function ------------------------------------------------------
 
 stage1 <- function(cohort_name) {
 
+  # Load json file containing vax study dates ------------------------------------
+
+  study_dates <- fromJSON("output/study_dates.json")
+
+  # Specify relevant dates -----------------------------------------------------
+
+  vax_start_date <- as.Date(study_dates$vax1_earliest, format="%Y-%m-%d")
+  mixed_vax_threshold <- as.Date("2021-05-07")
+  start_date_delta <- as.Date(study_dates$delta_date, format="%Y-%m-%d")
+  end_date_delta <- as.Date(study_dates$omicron_date, format="%Y-%m-%d") 
+
   ## Load cohort data ----------------------------------------------------------
 
   input <- read_rds(file.path("output", paste0("input_",cohort_name,".rds")))
   print(paste0(cohort_name,  " cohort: ", nrow(input), " rows in the input file"))
 
   ## Rename date variables -----------------------------------------------------
 
-  input <- input %>%
-    rename(index_date =!!sym(paste0("index_date_",cohort_name))) %>%
-    rename(end_date = !!sym(paste0("end_date_",cohort_name)))
+  input <- dplyr::rename(input, "index_date" = "index_date_cohort")
 
   ## Handle missing values -----------------------------------------------------
 
@@ -157,10 +155,8 @@ stage1 <- function(cohort_name) {
 
   ### Rule 5: HRT or COCP meds for men
 
-  input$rule5 <- FALSE
-  # input$rule5 <- NA
-  # input$rule5 <- ((input$cov_cat_sex=="Male" & input$qa_bin_hrt==TRUE) | 
-  #                   (input$cov_cat_sex=="Male" & input$qa_bin_cocp==TRUE))
+  input$rule5 <- NA
+ input$rule5 <- (input$cov_cat_sex=="Male" & input$qa_bin_hrtcocp==TRUE)
 
   ### Rule 6: Prostate cancer codes for women
 
@@ -325,7 +321,7 @@ stage1 <- function(cohort_name) {
     input <- input %>% filter (!is.na(index_date) & index_date <= end_date & index_date >= start_date_delta)
     cohort_flow[nrow(cohort_flow)+1,] <- c(nrow(input),as.numeric(cohort_flow[nrow(cohort_flow),"N"]) - nrow(input), "Criteria 13 (Inclusion): Patient index date is within the study start and end dates i.e patient is fully vaccinated before the study end date")
 
-  } else if (cohort_name == "unvax"){
+  } else if (cohort_name %in% c("unvax","unvax_extf")){
 
     ### Exclusion criteria 8: Have a record of one or more vaccinations prior to index date
     # i.e. Have a record of a first vaccination prior to index date
@@ -344,7 +340,7 @@ stage1 <- function(cohort_name) {
     cohort_flow[nrow(cohort_flow)+1,] <- c(nrow(input),as.numeric(cohort_flow[nrow(cohort_flow),"N"]) - nrow(input), "Criteria 9 (Exclusion): Missing or unknown JCVI group")
 
     ### Inclusion criteria 10: Index date is before cohort end date - will remove anyone whose eligibility date + 84 days is after study end date (only those with unknown JCVI group)
-    input <- input %>% filter (!is.na(index_date) & index_date <= end_date & index_date >= start_date_delta)
+    input <- input %>% filter (!is.na(index_date) & index_date <= end_date_exposure & index_date >= start_date_delta)
     cohort_flow[nrow(cohort_flow)+1,] <- c(nrow(input),as.numeric(cohort_flow[nrow(cohort_flow),"N"]) - nrow(input), "Criteria 10 (Inclusion): Patient index date is within the study start and end dates i.e patients eligibility date + 84 days is before the study end date")
 
   }
@@ -399,6 +395,7 @@ if (cohort_name == "all") {
   stage1("prevax_extf")
   stage1("vax")
   stage1("unvax")
+  stage1("unvax_extf")
 } else{
   stage1(cohort_name)
 }