Skip to content

Commit

Permalink
Improve date handling
Browse files Browse the repository at this point in the history
  • Loading branch information
venexia committed Feb 7, 2023
1 parent 52c56af commit 4d7793e
Show file tree
Hide file tree
Showing 18 changed files with 634 additions and 192 deletions.
14 changes: 14 additions & 0 deletions analysis/codelists.py
Original file line number Diff line number Diff line change
Expand Up @@ -746,4 +746,18 @@
anxiety_icd10,
ocd_icd10,
ptsd_icd10
)

# COCP medication codelist (presumably "combined oral contraceptive pill" — confirm).
# Loaded from the "dmd_id" column of the CSV below and registered under the
# "snomed" coding system.
# NOTE(review): the file/column names suggest dm+d identifiers while system is
# "snomed"; presumably intentional — confirm against the codelist source.
cocp_dmd = codelist_from_csv(
"codelists/user-elsie_horne-cocp_dmd.csv",
system="snomed",
column="dmd_id",
)

# HRT medication codelist (presumably "hormone replacement therapy" — confirm).
# Loaded from the "dmd_id" column of the CSV below and registered under the
# "snomed" coding system.
# NOTE(review): the file/column names suggest dm+d identifiers while system is
# "snomed"; presumably intentional — confirm against the codelist source.
hrt_dmd = codelist_from_csv(
"codelists/user-elsie_horne-hrt_dmd.csv",
system="snomed",
column="dmd_id",
)
78 changes: 49 additions & 29 deletions analysis/common_variables.py

Large diffs are not rendered by default.

99 changes: 63 additions & 36 deletions analysis/create_project_actions.R
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,17 @@ actions_list <- splice(
)
),

comment("Implement study_definition for unvax_extf"),

# Cohort extraction for the unvax_extf cohort: runs cohortextractor's
# generate_cohort against study_definition_unvax_extf with csv.gz output.
# Declared to need "vax_eligibility_inputs" and "generate_index_dates".
# NOTE(review): action() is defined outside this view; presumably it turns
# these arguments into a pipeline action entry — confirm.
action(
name = "generate_study_population_unvax_extf",
run = "cohortextractor:latest generate_cohort --study-definition study_definition_unvax_extf --output-format csv.gz",
needs = list("vax_eligibility_inputs","generate_index_dates"),
# The extracted cohort file is declared as a highly sensitive output.
highly_sensitive = list(
cohort = glue("output/input_unvax_extf.csv.gz")
)
),

comment("Preprocess data - prevax"),

action(
Expand Down Expand Up @@ -301,12 +312,28 @@ actions_list <- splice(
)
),

comment("Preprocess data - unvax_extf"),

# Preprocessing for the unvax_extf cohort: runs analysis/preprocess_data.R
# with the single argument "unvax_extf".
# Declared to need "generate_index_dates" and the unvax_extf cohort extraction
# action ("generate_study_population_unvax_extf").
# NOTE(review): action() is defined outside this view; presumably it turns
# these arguments into a pipeline action entry — confirm.
action(
name = "preprocess_data_unvax_extf",
run = "r:latest analysis/preprocess_data.R unvax_extf",
needs = list("generate_index_dates", "generate_study_population_unvax_extf"),
# Text descriptions of the input and venn data: moderately sensitive outputs.
moderately_sensitive = list(
describe = glue("output/describe_input_unvax_extf_stage0.txt"),
describe_venn = glue("output/describe_venn_unvax_extf.txt")
),
# The .rds cohort and venn files: highly sensitive outputs.
highly_sensitive = list(
cohort = glue("output/input_unvax_extf.rds"),
venn = glue("output/venn_unvax_extf.rds")
)
),

comment("Data cleaning - all cohorts"),

action(
name = "stage1_data_cleaning_all",
run = "r:latest analysis/stage1_data_cleaning.R all",
needs = list("preprocess_data_prevax","preprocess_data_prevax_extf","preprocess_data_vax", "preprocess_data_unvax","vax_eligibility_inputs"),
needs = list("preprocess_data_prevax","preprocess_data_prevax_extf","preprocess_data_vax", "preprocess_data_unvax", "preprocess_data_unvax_extf"),
moderately_sensitive = list(
refactoring = glue("output/meta_data_factors_*.csv"),
QA_rules = glue("output/QA_summary_*.csv"),
Expand All @@ -318,41 +345,41 @@ actions_list <- splice(
)
),

action(
name = glue("describe_file-input_prevax_stage1"),
run = glue("r:latest analysis/describe_file.R input_prevax_stage1 rds"),
needs = list("stage1_data_cleaning_all"),
moderately_sensitive = list(
describe_model_input = glue("output/describe-input_prevax_stage1.txt")
)
),

action(
name = glue("describe_file-input_prevax_extf_stage1"),
run = glue("r:latest analysis/describe_file.R input_prevax_extf_stage1 rds"),
needs = list("stage1_data_cleaning_all"),
moderately_sensitive = list(
describe_model_input = glue("output/describe-input_prevax_extf_stage1.txt")
)
),

action(
name = glue("describe_file-input_vax_stage1"),
run = glue("r:latest analysis/describe_file.R input_vax_stage1 rds"),
needs = list("stage1_data_cleaning_all"),
moderately_sensitive = list(
describe_model_input = glue("output/describe-input_vax_stage1.txt")
)
),

action(
name = glue("describe_file-input_unvax_stage1"),
run = glue("r:latest analysis/describe_file.R input_unvax_stage1 rds"),
needs = list("stage1_data_cleaning_all"),
moderately_sensitive = list(
describe_model_input = glue("output/describe-input_unvax_stage1.txt")
)
),
# action(
# name = glue("describe_file-input_prevax_stage1"),
# run = glue("r:latest analysis/describe_file.R input_prevax_stage1 rds"),
# needs = list("stage1_data_cleaning_all"),
# moderately_sensitive = list(
# describe_model_input = glue("output/describe-input_prevax_stage1.txt")
# )
# ),
#
# action(
# name = glue("describe_file-input_prevax_extf_stage1"),
# run = glue("r:latest analysis/describe_file.R input_prevax_extf_stage1 rds"),
# needs = list("stage1_data_cleaning_all"),
# moderately_sensitive = list(
# describe_model_input = glue("output/describe-input_prevax_extf_stage1.txt")
# )
# ),
#
# action(
# name = glue("describe_file-input_vax_stage1"),
# run = glue("r:latest analysis/describe_file.R input_vax_stage1 rds"),
# needs = list("stage1_data_cleaning_all"),
# moderately_sensitive = list(
# describe_model_input = glue("output/describe-input_vax_stage1.txt")
# )
# ),
#
# action(
# name = glue("describe_file-input_unvax_stage1"),
# run = glue("r:latest analysis/describe_file.R input_unvax_stage1 rds"),
# needs = list("stage1_data_cleaning_all"),
# moderately_sensitive = list(
# describe_model_input = glue("output/describe-input_unvax_stage1.txt")
# )
# ),

# comment("Stage 2 - Missing - Table 1 - all cohorts"),
#
Expand Down
66 changes: 12 additions & 54 deletions analysis/make_model_input.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ for (i in 1:nrow(active_analyses)) {

input <- input[,unique(c("patient_id",
"index_date",
"end_date",
"end_date_exposure",
"end_date_outcome",
active_analyses$exposure[i],
active_analyses$outcome[i],
unlist(strsplit(active_analyses$strata[i], split = ";")),
Expand All @@ -75,60 +76,17 @@ for (i in 1:nrow(active_analyses)) {
"exp_date" = active_analyses$exposure[i])

input <- input %>%
dplyr::mutate(out_date = replace(out_date, which(out_date>end_date | out_date<index_date), NA),
exp_date = replace(exp_date, which(exp_date>end_date | exp_date<index_date), NA),
dplyr::mutate(out_date = replace(out_date, which(out_date>end_date_outcome | out_date<index_date), NA),
exp_date = replace(exp_date, which(exp_date>end_date_exposure | exp_date<index_date), NA),
sub_cat_covid19_hospital = replace(sub_cat_covid19_hospital, which(is.na(exp_date)),"no_infection"))

# Update end date to be outcome date where applicable ------------------------
print('Update end date to be outcome date where applicable')

input <- input %>%
dplyr::rowwise() %>%
dplyr::mutate(end_date = min(end_date, out_date, na.rm = TRUE))

# # Make three level history covariates ----------------------------------------
# print('Make three level history covariates')
#
# input$cov_cat_priorhistory_depression <- dplyr::case_when(
# input$cov_bin_history_depression==TRUE & input$cov_bin_recent_depression==TRUE ~ "recent",
# input$cov_bin_history_depression==TRUE & input$cov_bin_recent_depression==FALSE ~ "notrecent",
# input$cov_bin_history_depression==FALSE & input$cov_bin_recent_depression==TRUE ~ "recent",
# input$cov_bin_history_depression==FALSE & input$cov_bin_recent_depression==FALSE ~ "none")
# input[,c("cov_bin_history_depression","cov_bin_recent_depression")] <- NULL
# input$cov_cat_priorhistory_depression <- as.factor(input$cov_cat_priorhistory_depression)
#
# input$cov_cat_priorhistory_anxiety_general <- dplyr::case_when(
# input$cov_bin_history_anxiety==TRUE & input$cov_bin_recent_anxiety==TRUE ~ "recent",
# input$cov_bin_history_anxiety==TRUE & input$cov_bin_recent_anxiety==FALSE ~ "notrecent",
# input$cov_bin_history_anxiety==FALSE & input$cov_bin_recent_anxiety==TRUE ~ "recent",
# input$cov_bin_history_anxiety==FALSE & input$cov_bin_recent_anxiety==FALSE ~ "none")
# input[,c("cov_bin_history_anxiety","cov_bin_recent_anxiety")] <- NULL
# input$cov_cat_priorhistory_anxiety_general <- as.factor(input$cov_cat_priorhistory_anxiety_general)
#
# input$cov_cat_priorhistory_eating_disorders <- dplyr::case_when(
# input$cov_bin_history_eating_disorders==TRUE & input$cov_bin_recent_eating_disorders==TRUE ~ "recent",
# input$cov_bin_history_eating_disorders==TRUE & input$cov_bin_recent_eating_disorders==FALSE ~ "notrecent",
# input$cov_bin_history_eating_disorders==FALSE & input$cov_bin_recent_eating_disorders==TRUE ~ "recent",
# input$cov_bin_history_eating_disorders==FALSE & input$cov_bin_recent_eating_disorders==FALSE ~ "none")
# input[,c("cov_bin_history_eating_disorders","cov_bin_recent_eating_disorders")] <- NULL
# input$cov_cat_priorhistory_eating_disorders <- as.factor(input$cov_cat_priorhistory_eating_disorders)
#
# input$cov_cat_priorhistory_serious_mental_illness <- dplyr::case_when(
# input$cov_bin_history_serious_mental_illness==TRUE & input$cov_bin_recent_serious_mental_illness==TRUE ~ "recent",
# input$cov_bin_history_serious_mental_illness==TRUE & input$cov_bin_recent_serious_mental_illness==FALSE ~ "notrecent",
# input$cov_bin_history_serious_mental_illness==FALSE & input$cov_bin_recent_serious_mental_illness==TRUE ~ "recent",
# input$cov_bin_history_serious_mental_illness==FALSE & input$cov_bin_recent_serious_mental_illness==FALSE ~ "none")
# input[,c("cov_bin_history_serious_mental_illness","cov_bin_recent_serious_mental_illness")] <- NULL
# input$cov_cat_priorhistory_serious_mental_illness <- as.factor(input$cov_cat_priorhistory_serious_mental_illness)
#
# input$cov_cat_priorhistory_self_harm <- dplyr::case_when(
# input$cov_bin_history_self_harm==TRUE & input$cov_bin_recent_self_harm==TRUE ~ "recent",
# input$cov_bin_history_self_harm==TRUE & input$cov_bin_recent_self_harm==FALSE ~ "notrecent",
# input$cov_bin_history_self_harm==FALSE & input$cov_bin_recent_self_harm==TRUE ~ "recent",
# input$cov_bin_history_self_harm==FALSE & input$cov_bin_recent_self_harm==FALSE ~ "none")
# input[,c("cov_bin_history_self_harm","cov_bin_recent_self_harm")] <- NULL
# input$cov_cat_priorhistory_self_harm <- as.factor(input$cov_cat_priorhistory_self_harm)

dplyr::mutate(end_date_outcome = min(end_date_outcome, out_date, na.rm = TRUE))

# Make model input: main -------------------------------------------------------

if (active_analyses$analysis[i]=="main") {
Expand All @@ -155,11 +113,11 @@ for (i in 1:nrow(active_analyses)) {
df <- input[input$sub_bin_covid19_confirmed_history==FALSE,]

df <- df %>%
dplyr::mutate(end_date = replace(end_date, which(sub_cat_covid19_hospital=="non_hospitalised"), exp_date-1),
dplyr::mutate(end_date_outcome = replace(end_date_outcome, which(sub_cat_covid19_hospital=="non_hospitalised"), exp_date-1),
exp_date = replace(exp_date, which(sub_cat_covid19_hospital=="non_hospitalised"), NA),
out_date = replace(out_date, which(out_date>end_date), NA))
out_date = replace(out_date, which(out_date>end_date_outcome), NA))

df <- df[df$end_date>=df$index_date,]
df <- df[df$end_date_outcome>=df$index_date,]

df[,colnames(df)[grepl("sub_",colnames(df))]] <- NULL

Expand All @@ -179,11 +137,11 @@ for (i in 1:nrow(active_analyses)) {
df <- input[input$sub_bin_covid19_confirmed_history==FALSE,]

df <- df %>%
dplyr::mutate(end_date = replace(end_date, which(sub_cat_covid19_hospital=="hospitalised"), exp_date-1),
dplyr::mutate(end_date_outcome = replace(end_date_outcome, which(sub_cat_covid19_hospital=="hospitalised"), exp_date-1),
exp_date = replace(exp_date, which(sub_cat_covid19_hospital=="hospitalised"), NA),
out_date = replace(out_date, which(out_date>end_date), NA))
out_date = replace(out_date, which(out_date>end_date_outcome), NA))

df <- df[df$end_date>=df$index_date,]
df <- df[df$end_date_outcome>=df$index_date,]
df$index_date <- as.Date(df$index_date)

df[,colnames(df)[grepl("sub_",colnames(df))]] <- NULL
Expand Down
12 changes: 7 additions & 5 deletions analysis/prelim.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,17 @@ prelim_data <- prelim_data %>%
unit = "days"))) %>%
mutate(vax_date_covid_2_offset = vax_date_covid_2 + days(efficacy_offset),
vax_date_eligible_offset = vax_date_eligible + days(eligibility_offset),
index_prevax = as.Date(study_dates$pandemic_start)) %>%
index_prevax = as.Date(study_dates$pandemic_start),
index_prevax_extf = as.Date(study_dates$pandemic_start)) %>%
rowwise() %>%
mutate(index_vax = max(c(vax_date_covid_2_offset, delta_date), na.rm=T),
index_unvax = max(c(vax_date_eligible_offset, delta_date), na.rm=T),
index_unvax_extf = max(c(vax_date_eligible_offset, delta_date), na.rm=T),
end_vax = min(c(death_date, delta_end_date), na.rm=T),
end_unvax = min(c(death_date, delta_end_date), na.rm=T),
end_prevax = min(c(vax_date_eligible,death_date, vax_date_covid_1, all_eligible_date), na.rm=T),
end_prevax_exf = min(c(death_date, vax_date_covid_1, na.rm=T)))

end_unvax = min(c(death_date, delta_end_date, vax_date_covid_1), na.rm=T),
end_unvax_extf = min(c(death_date, delta_end_date), na.rm=T),
end_prevax = min(c(vax_date_eligible, death_date, vax_date_covid_1, all_eligible_date), na.rm=T),
end_prevax_extf = min(c(death_date, delta_end_date), na.rm=T))

#Write data to csv file
write_csv(prelim_data, "output/index_dates.csv.gz")
2 changes: 1 addition & 1 deletion analysis/preprocess_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ df <- df %>%

if(Sys.getenv("OPENSAFELY_BACKEND") %in% c("", "expectations") &&
cohort_name %in% c("vax")) {
source("analysis/preprocess/modify_dummy_vax_data.R")
source("analysis/modify_dummy_vax_data.R")
message("Vaccine information overwritten successfully")
}

Expand Down
39 changes: 18 additions & 21 deletions analysis/stage1_data_cleaning.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,36 +15,34 @@ library(arrow)
args <- commandArgs(trailingOnly=TRUE)

if(length(args)==0){
cohort_name <- "prevax"
cohort_name <- "unvax"
} else {
cohort_name <- args[[1]]
}

# Load json file containing vax study dates ------------------------------------

study_dates <- fromJSON("output/study_dates.json")

# Specify relevant dates -------------------------------------------------------

vax_start_date <- as.Date(study_dates$vax1_earliest, format="%Y-%m-%d")
mixed_vax_threshold <- as.Date("2021-05-07")
start_date_delta <- as.Date(study_dates$delta_date, format="%Y-%m-%d")
end_date_delta <- as.Date(study_dates$omicron_date, format="%Y-%m-%d")

# Define stage 1 function ------------------------------------------------------

stage1 <- function(cohort_name) {

# Load json file containing vax study dates ------------------------------------

study_dates <- fromJSON("output/study_dates.json")

# Specify relevant dates -----------------------------------------------------

vax_start_date <- as.Date(study_dates$vax1_earliest, format="%Y-%m-%d")
mixed_vax_threshold <- as.Date("2021-05-07")
start_date_delta <- as.Date(study_dates$delta_date, format="%Y-%m-%d")
end_date_delta <- as.Date(study_dates$omicron_date, format="%Y-%m-%d")

## Load cohort data ----------------------------------------------------------

input <- read_rds(file.path("output", paste0("input_",cohort_name,".rds")))
print(paste0(cohort_name, " cohort: ", nrow(input), " rows in the input file"))

## Rename date variables -----------------------------------------------------

input <- input %>%
rename(index_date =!!sym(paste0("index_date_",cohort_name))) %>%
rename(end_date = !!sym(paste0("end_date_",cohort_name)))
input <- dplyr::rename(input, "index_date" = "index_date_cohort")

## Handle missing values -----------------------------------------------------

Expand Down Expand Up @@ -157,10 +155,8 @@ stage1 <- function(cohort_name) {

### Rule 5: HRT or COCP meds for men

input$rule5 <- FALSE
# input$rule5 <- NA
# input$rule5 <- ((input$cov_cat_sex=="Male" & input$qa_bin_hrt==TRUE) |
# (input$cov_cat_sex=="Male" & input$qa_bin_cocp==TRUE))
input$rule5 <- NA
input$rule5 <- (input$cov_cat_sex=="Male" & input$qa_bin_hrtcocp==TRUE)

### Rule 6: Prostate cancer codes for women

Expand Down Expand Up @@ -325,7 +321,7 @@ stage1 <- function(cohort_name) {
input <- input %>% filter (!is.na(index_date) & index_date <= end_date & index_date >= start_date_delta)
cohort_flow[nrow(cohort_flow)+1,] <- c(nrow(input),as.numeric(cohort_flow[nrow(cohort_flow),"N"]) - nrow(input), "Criteria 13 (Inclusion): Patient index date is within the study start and end dates i.e patient is fully vaccinated before the study end date")

} else if (cohort_name == "unvax"){
} else if (cohort_name %in% c("unvax","unvax_extf")){

### Exclusion criteria 8: Have a record of one or more vaccinations prior to index date
# i.e. Have a record of a first vaccination prior to index date
Expand All @@ -344,7 +340,7 @@ stage1 <- function(cohort_name) {
cohort_flow[nrow(cohort_flow)+1,] <- c(nrow(input),as.numeric(cohort_flow[nrow(cohort_flow),"N"]) - nrow(input), "Criteria 9 (Exclusion): Missing or unknown JCVI group")

### Inclusion criteria 10: Index date is before cohort end date - will remove anyone whose eligibility date + 84 days is after study end date (only those with unknown JCVI group)
input <- input %>% filter (!is.na(index_date) & index_date <= end_date & index_date >= start_date_delta)
input <- input %>% filter (!is.na(index_date) & index_date <= end_date_exposure & index_date >= start_date_delta)
cohort_flow[nrow(cohort_flow)+1,] <- c(nrow(input),as.numeric(cohort_flow[nrow(cohort_flow),"N"]) - nrow(input), "Criteria 10 (Inclusion): Patient index date is within the study start and end dates i.e patients eligibility date + 84 days is before the study end date")

}
Expand Down Expand Up @@ -399,6 +395,7 @@ if (cohort_name == "all") {
stage1("prevax_extf")
stage1("vax")
stage1("unvax")
stage1("unvax_extf")
} else{
stage1(cohort_name)
}
Loading

0 comments on commit 4d7793e

Please sign in to comment.