-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhlp_redcap_import.py
82 lines (65 loc) · 2.73 KB
/
hlp_redcap_import.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python3
"""
Takes the XLSX output of dempgraphic_report.py and makes it REDCap import ready
"""
import argparse
from datetime import date
import pandas as pd
import numpy as np
parser = argparse.ArgumentParser(
description='Convert XLSX demographic report into REDCap ready CSV file')
parser.add_argument('-f', '--file', required=True,
help='(required) XLSX demographic report file')
parser.add_argument('-i', '--startindex',
action='store',
help='Integer index to start with (broken; do not use yet)')
args = parser.parse_args()
sex_map = {
'Female': 0,
'Male': 1,
'Unknown or Not Reported': 2,
}
race_map = {
'American Indian / Alaska Native': 0,
'Asian': 1,
'Native Hawaiian or Other Pacific Islander': 2,
'Black or African American': 3,
'White': 4,
'More Than One Race': 5,
'Unknown or Not Reported': 6,
'Other': 6,
}
eth_map = {
'Hisp': 0,
'NonHisp': 1,
'Unknown or Not Reported': 2
}
# Assumes sheet hasn't been renamed
df = pd.read_excel(args.file, 'Demographic Data')
# Assumes file retains name that demographic_repory.py gave it
protocol = args.file[:args.file.index('_')]
# Convert all the text into the numbers REDCap stores them as
df['mturk_sex'] = df['Sex'].apply(lambda x: sex_map[x])
df['mturk_race'] = df['Race'].apply(lambda y: race_map[y])
df['mturk_ethnicity'] = df['Ethnicity'].apply(lambda z: eth_map[z])
# Just select the columns we care about into a new data frame
core_cols = ('workerid', 'mturk_sex', 'mturk_race', 'mturk_ethnicity')
out_df = df.loc[:, core_cols]
# FIXME: Something goes wrong with creating record_id if this is used
# If you, e.g. set args.startindex to 10, then the first record is mt20
# and the final 10 records get NaN as their record_id. After splitting
# record_id creating into steps I can tell you the list from `map` is fine
# but it gets mangled at the `pd.Series` step. But no clue why.
if args.startindex:
out_df.index += int(args.startindex)
# Each record needs a unique 'record_id'
out_df['record_id'] = pd.Series(map(lambda x: f'mt{x:04}', out_df.index.values))
# Age is pretty useless and mostly not capture so just set it to nothing
out_df['mturk_age'] = np.nan
out_df['mturk_demography_form_complete'] = 1 # This actually maps to "Unverified", but that's fine.
# REDCap expects a different name
out_df.rename(columns={'workerid': 'mturk_workerid'}, inplace=True)
# REDCap expects the columns in this order
out_columns = ('record_id', 'mturk_workerid', 'mturk_age', 'mturk_sex', 'mturk_race', 'mturk_ethnicity', 'mturk_demography_form_complete')
out_df = out_df.reindex(columns=out_columns)
out_df.to_csv(f'{protocol}-redcap-{date.today().isoformat()}.csv', index=False)