-
Notifications
You must be signed in to change notification settings - Fork 6
/
clean_clinical_data.py
executable file
·44 lines (29 loc) · 1.42 KB
/
clean_clinical_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/env python3
"""Script to clean clinical datasets."""
import argparse
from pathlib import Path
from utils import load_demographic_data
# Root directory against which the data/ and outputs/ paths are resolved.
# It is the current working directory at import time, so the script
# presumably has to be launched from the project root -- TODO confirm.
PROJECT_ROOT = Path.cwd()
def main(dataset_name):
    """Clean the data from a clinical dataset.

    Subjects outside the age range [47, 73] (chosen based on the UK Biobank
    data) are excluded, only one row per ``participant_id`` is kept, and the
    remaining ``Image_ID`` values are written to
    ``outputs/<dataset_name>_cleaned_ids.csv``.

    Args:
        dataset_name: Name of the dataset folder under ``data/``. The folder
            is expected to contain ``participants.tsv`` and
            ``freesurferData.csv``.

    Raises:
        ValueError: If the selected ``Image_ID`` values are not unique.
    """
    # ----------------------------------------------------------------------------------------
    participants_path = PROJECT_ROOT / 'data' / dataset_name / 'participants.tsv'
    ids_path = PROJECT_ROOT / 'data' / dataset_name / 'freesurferData.csv'
    output_ids_filename = dataset_name + '_cleaned_ids.csv'
    # ----------------------------------------------------------------------------------------
    outputs_dir = PROJECT_ROOT / 'outputs'

    # NOTE(review): load_demographic_data is defined in utils; it is assumed
    # to return a pandas DataFrame with 'Age', 'participant_id' and
    # 'Image_ID' columns -- confirm against utils.
    dataset = load_demographic_data(participants_path, ids_path)

    # Keep only subjects inside the inclusive age range [47, 73].
    min_age, max_age = 47, 73
    dataset = dataset.loc[(dataset['Age'] >= min_age) & (dataset['Age'] <= max_age)]

    # Keep a single row per participant.
    dataset = dataset.drop_duplicates(subset='participant_id')

    output_ids_df = dataset[['Image_ID']]

    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and would let duplicated image ids be written silently.
    if output_ids_df.duplicated().any():
        raise ValueError(
            'Duplicated Image_ID values found after cleaning dataset '
            + repr(dataset_name)
        )

    # The outputs directory may not exist yet on a fresh checkout.
    outputs_dir.mkdir(parents=True, exist_ok=True)
    output_ids_df.to_csv(outputs_dir / output_ids_filename, index=False)
if __name__ == "__main__":
    # Command-line entry point: parse the dataset name and run the cleaning.
    parser = argparse.ArgumentParser()
    # The dataset name is mandatory: without it main() would receive None and
    # fail with an opaque TypeError while building the input paths.
    parser.add_argument('-D', '--dataset_name',
                        dest='dataset_name',
                        required=True,
                        help='Dataset name to clean the data.')
    args = parser.parse_args()

    main(args.dataset_name)