Skip to content

Commit

Permalink
Clean repo, up to preprocessing script
Browse files: browse the repository at this point in the history
  • Loading branch information
venexia committed Jan 31, 2023
1 parent 91c7d79 commit d530fe6
Show file tree
Hide file tree
Showing 52 changed files with 2,942 additions and 5,587 deletions.
96 changes: 51 additions & 45 deletions analysis/active_analyses.R

Large diffs are not rendered by default.

402 changes: 2 additions & 400 deletions analysis/codelists.py

Large diffs are not rendered by default.

2,905 changes: 1,278 additions & 1,627 deletions analysis/common_variables.py

Large diffs are not rendered by default.

42 changes: 21 additions & 21 deletions analysis/create_project_actions.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ cohorts <- unique(active_analyses$cohort)

# Determine which outputs are ready --------------------------------------------

success <- readxl::read_excel("C:/Users/aw15952/OneDrive - University of Bristol/grp-EHR/Projects/post-covid-outcome-tracker.xlsx",
success <- readxl::read_excel("../../OneDrive - University of Bristol/grp-EHR/Projects/post-covid-outcome-tracker.xlsx",
sheet = "mentalhealth",
col_types = c("text","text", "text", "text", "text", "text",
"text", "text", "text", "text", "text",
Expand Down Expand Up @@ -141,7 +141,7 @@ apply_model_function <- function(name, cohort, analysis, ipw, strata,
# arguments = c(cohort),
# needs = list("stage1_data_cleaning_all"),
# moderately_sensitive = list(
# input_table_2 = glue("output/review/descriptives/table2_{cohort}.csv")
# input_table_2 = glue("output/descriptives/table2_{cohort}.csv")
# )
# )
# )
Expand All @@ -166,19 +166,19 @@ actions_list <- splice(
run = "r:latest analysis/metadates.R",
highly_sensitive = list(
study_dates_json = glue("output/study_dates.json"),
vax_jcvi_groups= glue("output/vax_jcvi_groups.csv"),
vax_eligible_dates= ("output/vax_eligible_dates.csv")
vax_jcvi_groups= glue("output/vax_jcvi_groups.csv.gz"),
vax_eligible_dates= ("output/vax_eligible_dates.csv.gz")
)
),

comment("Generate prelim study_definition"),

action(
name = "generate_study_population_prelim",
run = "cohortextractor:latest generate_cohort --study-definition study_definition_prelim --output-format feather",
run = "cohortextractor:latest generate_cohort --study-definition study_definition_prelim --output-format csv.gz",
needs = list("vax_eligibility_inputs"),
highly_sensitive = list(
cohort = glue("output/input_prelim.feather")
cohort = glue("output/input_prelim.csv.gz")
)
),

Expand All @@ -189,7 +189,7 @@ actions_list <- splice(
run = "r:latest analysis/prelim.R",
needs = list("vax_eligibility_inputs","generate_study_population_prelim"),
highly_sensitive = list(
index_dates = glue("output/index_dates.csv")
index_dates = glue("output/index_dates.csv.gz")
)
),

Expand Down Expand Up @@ -233,8 +233,8 @@ actions_list <- splice(
run = "r:latest analysis/preprocess/preprocess_data.R prevax",
needs = list( "generate_index_dates","generate_study_population_prevax"),
moderately_sensitive = list(
describe = glue("output/not-for-review/describe_input_prevax_stage0.txt"),
describe_venn = glue("output/not-for-review/describe_venn_prevax.txt")
describe = glue("output/describe_input_prevax_stage0.txt"),
describe_venn = glue("output/describe_venn_prevax.txt")
),
highly_sensitive = list(
cohort = glue("output/input_prevax.rds"),
Expand All @@ -249,8 +249,8 @@ actions_list <- splice(
run = "r:latest analysis/preprocess/preprocess_data.R vax",
needs = list("generate_index_dates","generate_study_population_vax"),
moderately_sensitive = list(
describe = glue("output/not-for-review/describe_input_vax_stage0.txt"),
descrive_venn = glue("output/not-for-review/describe_venn_vax.txt")
describe = glue("output/describe_input_vax_stage0.txt"),
descrive_venn = glue("output/describe_venn_vax.txt")
),
highly_sensitive = list(
cohort = glue("output/input_vax.rds"),
Expand All @@ -265,8 +265,8 @@ actions_list <- splice(
run = "r:latest analysis/preprocess/preprocess_data.R unvax",
needs = list("generate_index_dates", "generate_study_population_unvax"),
moderately_sensitive = list(
describe = glue("output/not-for-review/describe_input_unvax_stage0.txt"),
describe_venn = glue("output/not-for-review/describe_venn_unvax.txt")
describe = glue("output/describe_input_unvax_stage0.txt"),
describe_venn = glue("output/describe_venn_unvax.txt")
),
highly_sensitive = list(
cohort = glue("output/input_unvax.rds"),
Expand All @@ -281,10 +281,10 @@ actions_list <- splice(
run = "r:latest analysis/preprocess/Stage1_data_cleaning.R all",
needs = list("preprocess_data_prevax","preprocess_data_vax", "preprocess_data_unvax","vax_eligibility_inputs"),
moderately_sensitive = list(
refactoring = glue("output/not-for-review/meta_data_factors_*.csv"),
QA_rules = glue("output/review/descriptives/QA_summary_*.csv"),
IE_criteria = glue("output/review/descriptives/Cohort_flow_*.csv"),
histograms = glue("output/not-for-review/numeric_histograms_*.svg")
refactoring = glue("output/meta_data_factors_*.csv"),
QA_rules = glue("output/descriptives/QA_summary_*.csv"),
IE_criteria = glue("output/descriptives/Cohort_flow_*.csv"),
histograms = glue("output/numeric_histograms_*.svg")
),
highly_sensitive = list(
cohort = glue("output/input_*.rds")
Expand Down Expand Up @@ -325,9 +325,9 @@ actions_list <- splice(
# run = "r:latest analysis/descriptives/Stage2_missing_table1.R all",
# needs = list("stage1_data_cleaning_all"),
# moderately_sensitive = list(
# Missing_RangeChecks = glue("output/not-for-review/Check_missing_range_*.csv"),
# DateChecks = glue("output/not-for-review/Check_dates_range_*.csv"),
# Descriptive_Table = glue("output/review/descriptives/Table1_*.csv")
# Missing_RangeChecks = glue("output/Check_missing_range_*.csv"),
# DateChecks = glue("output/Check_dates_range_*.csv"),
# Descriptive_Table = glue("output/descriptives/Table1_*.csv")
# )
# ),

Expand All @@ -343,7 +343,7 @@ actions_list <- splice(
# run = "r:latest analysis/descriptives/venn_diagram.R all",
# needs = list("preprocess_data_prevax","preprocess_data_vax", "preprocess_data_unvax", "stage1_data_cleaning_all","stage1_end_date_table_prevax", "stage1_end_date_table_vax", "stage1_end_date_table_unvax"),
# moderately_sensitive = list(
# venn_diagram = glue("output/review/venn-diagrams/venn_diagram_*"))
# venn_diagram = glue("output/venn-diagrams/venn_diagram_*"))
# ),

comment("Stage 5 - Run models"),
Expand Down
4 changes: 2 additions & 2 deletions analysis/grouping_variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@
### import groups and dates
# jcvi_groups
jcvi_groups = pd.read_csv(
filepath_or_buffer='output/vax_jcvi_groups.csv',
filepath_or_buffer='output/vax_jcvi_groups.csv.gz',
dtype=str
)
dict_jcvi = {jcvi_groups['group'][i]: jcvi_groups['definition'][i] for i in jcvi_groups.index}
ratio_jcvi = {jcvi_groups['group'][i]: 1/len(jcvi_groups.index) for i in jcvi_groups.index}

# elig_dates
elig_dates = pd.read_csv(
filepath_or_buffer='output/vax_eligible_dates.csv',
filepath_or_buffer='output/vax_eligible_dates.csv.gz',
dtype=str
)
dict_elig = { elig_dates['date'][i] : elig_dates['description'][i] for i in elig_dates.index }
Expand Down
Binary file removed analysis/index_dates.feather
Binary file not shown.
4 changes: 2 additions & 2 deletions analysis/metadates.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ tribble(
"99", "DEFAULT",
)

readr::write_csv(jcvi_groups,here::here( "output","vax_jcvi_groups.csv"))
readr::write_csv(jcvi_groups,here::here( "output","vax_jcvi_groups.csv.gz"))

# create elig_dates ----
elig_dates <-
Expand Down Expand Up @@ -89,4 +89,4 @@ tribble(
"2100-12-31", "DEFAULT", "NA",
)

readr::write_csv(elig_dates, here::here("output","vax_eligible_dates.csv"))
readr::write_csv(elig_dates, here::here("output","vax_eligible_dates.csv.gz"))
4 changes: 2 additions & 2 deletions analysis/prelim.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ efficacy_offset <- 14
eligibility_offset <- 84

# Read in the output of study_definition_prelim and add date variables
prelim_data <- arrow::read_feather("output/input_prelim.feather")
prelim_data <- readr::read_csv("output/input_prelim.csv.gz")
prelim_data <- prelim_data %>%
mutate(across(c(contains("_date")),
~ floor_date(
Expand All @@ -30,4 +30,4 @@ prelim_data <- prelim_data %>%


#Write data to csv file
write_csv(prelim_data, "output/index_dates.csv")
write_csv(prelim_data, "output/index_dates.csv.gz")
Loading

0 comments on commit d530fe6

Please sign in to comment.