-
Notifications
You must be signed in to change notification settings - Fork 0
/
process.jl
104 lines (75 loc) · 2.63 KB
/
process.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# process.jl
# OLD -- USE MAKE_DATA.JL
# example script for cleaning, processing, and writing the raw Honduras data
# These functions are agnostic to the specific variables requested from the specific datasets.
using DataFrames, DataFramesMeta, Dates
import CSV
using HondurasTools
# waves included in the data requested
# the paths should be in the same order
waves = [1, 2, 3];
basepath = "/WORKAREA/work/HONDURAS_GATES/E_FELTHAM/";
writepath = basepath * "clean_data/";
hh_paths = [
"WAVE1/v8_2021-03/honduras_households_WAVE1_v8.csv",
"WAVE2/v5_2021-03/honduras_households_WAVE2_v5.csv",
"WAVE3/v3_2021-03/honduras_households_WAVE3_v3.csv"
];
respondent_paths = [
"WAVE1/v8_2021-03/honduras_respondents_WAVE1_v8.csv",
"WAVE2/v5_2021-03/honduras_respondents_WAVE2_v5.csv",
"WAVE3/v3_2021-03/honduras_respondents_WAVE3_v3.csv",
];
cohort1pth = "COHORT_1/v1/hmb_respondents_cohort1_baseline_v1_E_FELTHAM_2022-09-08.csv";
cohort2pth = "COHORT_2/v1/hmb_respondents_cohort2_v1_E_FELTHAM_2022-09-08.csv";
con_paths = [
"WAVE1/v8_2021-03/honduras_connections_WAVE1_v8.csv",
"WAVE2/v5_2021-03/honduras_connections_WAVE2_v5.csv",
"WAVE3/v3_2021-03/honduras_connections_WAVE3_v3.csv"
];
# village paths
village_paths = [
]
# load data
resp = [
CSV.read(basepath * x, DataFrame; missingstring = "NA") for x in respondent_paths
];
@time resp = clean_respondent(
resp, waves; nokeymiss = true, selected = nothing
);
hh = [CSV.read(basepath * x, DataFrame; missingstring = "NA") for x in hh_paths];
@time hh = clean_household(hh; selected = nothing);
# microbiome data
mb1, mb2 = [
CSV.read(basepath * x, DataFrame; missingstring = "NA") for x in [cohort1pth, cohort2pth]
];
@time mb = clean_microbiome(mb1, mb2);
# village data
vdfs = [
CSV.read(
basepath * vpth, DataFrame; missingstring = "NA"
) for vpth in village_paths
];
vdf = clean_village(vdfs, waves)
# network data
conns = [CSV.read(
basepath * con_path, DataFrame; missingstring = "NA"
) for con_path in con_paths];
@time con = clean_connections(
conns,
waves;
alter_source = true,
same_village = true,
removemissing = true
);
# filter to relevant data desired
# response filters
# filter to data_source = 1
@subset!(dat, :data_source .== 1);
select!(dat, Not(:data_source));
# write
CSV.write(writepath * "resp_data_" * string(today()), resp);
CSV.write(writepath * "household_data_" * string(today()), hh);
CSV.write(writepath * "village_data_" * string(today()), vdf);
CSV.write(writepath * "microbiome_data_" * string(today()), mdat);
CSV.write(writepath * "connections_data_" * string(today()), con);