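"""Combine the daily algorithm results for the H&E QC, atypia, P53 and TFF3
models into a single REDCap-ready CRF CSV, then bundle each case's slide-level
results into a per-instance zip archive."""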
import os
import time
import pandas as pd
import shutil
# Get the current date in the format YYMMDD
date = time.strftime('%y%m%d')
# Path to the directory containing the results
base_path = '/media/prew01/BEST/BEST4/surveillance/'
# Output directory for the combined results
output_dir = '/media/prew01/BEST/BEST4/surveillance/results/'
os.makedirs(output_dir, exist_ok=True)  # exist_ok makes a separate existence check unnecessary
# Load each algorithm's process list for today's date and remap its columns
# to the corresponding REDCap field names
qc = pd.read_csv(os.path.join(base_path, f'he/features/40x_400/results/qc_process_list_{date}.csv'))
qc_column_mapping = {
    'slide_filename': 'h_e_slide_filename',
    'positive_tiles': 'he_numb_gastric_tile_alg',
    'algorithm_result': 'h_e_qc_result_alg',
    'tile_mapping': 'qc_tile_mapping',
    'algorithm_version': 'qc_algorithm_version',
}
qc = qc.rename(columns=qc_column_mapping)

atypia = pd.read_csv(os.path.join(base_path, f'he/features/40x_400/results/he_process_list_{date}.csv'))
atypia_column_mapping = {
    'slide_filename': 'h_e_slide_filename',
    'positive_tiles': 'atypia_positive_tiles',
    'algorithm_result': 'atypia_algorithm_result',
    'tile_mapping': 'atypia_tile_mapping',
    'algorithm_version': 'atypia_algorithm_version',
}
atypia = atypia.rename(columns=atypia_column_mapping)

p53 = pd.read_csv(os.path.join(base_path, f'p53/features/40x_400/results/p53_process_list_{date}.csv'))
p53_column_mapping = {
    'slide_filename': 'p53_slide_filename',
    'positive_tiles': 'p53_positive_tiles',
    'algorithm_result': 'p53_algorithm_result',
    'tile_mapping': 'p53_tile_mapping',
    'algorithm_version': 'p53_algorithm_version',
}
p53 = p53.rename(columns=p53_column_mapping)

tff3 = pd.read_csv(os.path.join(base_path, f'tff3/features/40x_400/results/tff3_process_list_{date}.csv'))
tff3_column_mapping = {
    'slide_filename': 'tff3_slide_filename',
    'positive_tiles': 'tff3_positive_tiles',
    'algorithm_result': 'tff3_algorithm_result',
    'tile_mapping': 'tff3_tile_mapping',
    'algorithm_version': 'tff3_algorithm_version',
}
tff3 = tff3.rename(columns=tff3_column_mapping)
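# The four load-and-rename blocks above share one pattern; a minimal helper
# sketch of that pattern is shown below for reference. It is not wired in,
# and the parameter names are illustrative only:
def load_process_list(stain_dir, csv_prefix, column_mapping):
    """Load one algorithm's process-list CSV and apply its column mapping."""
    path = os.path.join(base_path, f'{stain_dir}/features/40x_400/results/{csv_prefix}_process_list_{date}.csv')
    return pd.read_csv(path).rename(columns=column_mapping)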
# Step 1: Build lookup dictionaries mapping Cyted lab numbers to participant IDs
record_ids = pd.read_csv(os.path.join(base_path, 'data/BarrettsOESophagusTr-BEST4CambridgeLabSam_DATA_LABELS_2024-11-12_1458.csv'))
record_ids = record_ids.dropna(subset=['Cyted Lab Number (Format: YYCYT#####)'])
participant_ids = dict(zip(record_ids['Cyted Lab Number (Format: YYCYT#####)'], record_ids['Participant ID: ']))
# The same lookup for repeat (second-instance) records
repeat_record_ids = pd.read_csv(os.path.join(base_path, 'data/BarrettsOESophagusTr-BEST4CambridgeLabRep_DATA_LABELS_2024-11-12_1458.csv'))
repeat_record_ids = repeat_record_ids.dropna(subset=['Cyted Lab Number (Format: YYCYT#####)'])
repeat_participant_ids = dict(zip(repeat_record_ids['Cyted Lab Number (Format: YYCYT#####)'], repeat_record_ids['Participant ID: ']))
# REDCap bookkeeping columns that must lead the output CSV
redcap_columns = ['record_id', 'redcap_event_name', 'redcap_repeat_instrument', 'redcap_repeat_instance']
# Step 2: Merge all the result DataFrames on 'algorithm_cyted_sample_id'.
# The QC/atypia merge gets an explicit suffix so the H&E columns duplicated
# across both frames (e.g. 'h_e_slide_filename') can be dropped afterwards.
appended_df = pd.merge(qc, atypia, on='algorithm_cyted_sample_id', how='outer', suffixes=(None, '_atypia'))
duplicate_cols = [col for col in appended_df.columns if col.endswith('_atypia')]
appended_df = appended_df.drop(columns=duplicate_cols)
appended_df = pd.merge(appended_df, p53, on='algorithm_cyted_sample_id', how='outer')
appended_df = pd.merge(appended_df, tff3, on='algorithm_cyted_sample_id', how='outer')
# Prepend the (still empty) REDCap bookkeeping columns as the first four columns
appended_df = appended_df.reindex(columns=redcap_columns + appended_df.columns.tolist())
# Step 3: Match each row's Cyted sample ID to its participant record ID
not_found_cases = []
for index, row in appended_df.iterrows():
    cyted_sample_id = row['algorithm_cyted_sample_id']
    if cyted_sample_id in participant_ids:
        appended_df.at[index, 'record_id'] = participant_ids[cyted_sample_id]
        appended_df.at[index, 'redcap_repeat_instance'] = 1
    elif cyted_sample_id in repeat_participant_ids:
        appended_df.at[index, 'record_id'] = repeat_participant_ids[cyted_sample_id]
        appended_df.at[index, 'redcap_repeat_instance'] = 2
    else:
        not_found_cases.append(cyted_sample_id)
# Print cases not found in either reference dataset
if not_found_cases:
    print("Cases not found in the dataset:")
    for case in not_found_cases:
        print(case)
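# An equivalent vectorized alternative to the matching loop above, shown
# commented out so the row-wise version remains the one that runs (the
# assignment order makes first-instance IDs win, as in the loop):
# sample_ids = appended_df['algorithm_cyted_sample_id']
# appended_df['record_id'] = sample_ids.map(participant_ids).combine_first(sample_ids.map(repeat_participant_ids))
# appended_df.loc[sample_ids.isin(set(repeat_participant_ids)), 'redcap_repeat_instance'] = 2
# appended_df.loc[sample_ids.isin(set(participant_ids)), 'redcap_repeat_instance'] = 1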
# Every row imports into REDCap under the same event and repeating instrument
appended_df['redcap_event_name'] = 'unscheduled_arm_1'
appended_df['redcap_repeat_instrument'] = 'machine_learning_pathology_results'
# Sort the DataFrame by record_id and redcap_repeat_instance
appended_df.sort_values(by=['record_id', 'redcap_repeat_instance'], inplace=True)
output_path = os.path.join(output_dir, f'BEST4_AI_crfs_{date}.csv')
print(f'Saving appended data to {output_path}')
appended_df.to_csv(output_path, index=False) # Save the appended data to a CSV file
# Step 4: Archive each case's slide-level results as '<record_id>-<instance>.zip'
for _, row in appended_df.iterrows():
    best4_case_id = row['record_id']
    if pd.isnull(best4_case_id):
        continue
    repeat = row['redcap_repeat_instance']
    instance = f'{best4_case_id}-{int(repeat)}'
    case_dir = os.path.join(output_dir, instance)
    # Skip cases that have already been archived
    if os.path.exists(case_dir + '.zip'):
        continue
    os.makedirs(case_dir, exist_ok=True)
    # Copy the individual slide results into the case directory
    # (assumes all three stains are present for this case; a row left without
    # a stain by the outer merges would raise here)
    shutil.copytree(os.path.join(base_path, f'he/features/40x_400/results/{row["h_e_slide_filename"]}'), f'{case_dir}/{row["h_e_slide_filename"]}')
    shutil.copytree(os.path.join(base_path, f'p53/features/40x_400/results/{row["p53_slide_filename"]}'), f'{case_dir}/{row["p53_slide_filename"]}')
    shutil.copytree(os.path.join(base_path, f'tff3/features/40x_400/results/{row["tff3_slide_filename"]}'), f'{case_dir}/{row["tff3_slide_filename"]}')
    # Zip the case directory, then delete the unzipped copy
    shutil.make_archive(case_dir, 'zip', case_dir)
    shutil.rmtree(case_dir)
# Step 5: Check which cases are missing from the output directory
output_files = os.listdir(output_dir)
output_cases = [os.path.splitext(file)[0] for file in output_files if file.endswith('.zip')]
# splitext has already stripped '.zip', so each entry is a
# '<participant_id>-<instance>' name
output_participant_ids = set(output_cases)
# Suffix the participant IDs with '-1' (first instance) or '-2' (repeat) to
# match the archive naming used above
participant_ids = {k: f'{v}-1' for k, v in participant_ids.items()}
repeat_participant_ids = {k: f'{v}-2' for k, v in repeat_participant_ids.items()}
# Combine both dictionaries and invert them, so that each
# '<participant_id>-<instance>' maps back to its Cyted sample ID
all_participant_ids = {v: k for k, v in {**participant_ids, **repeat_participant_ids}.items()}
# Check if any participant IDs are missing from the output directory
missing_participant_ids = set(all_participant_ids.keys()) - output_participant_ids
if missing_participant_ids:
    print("Participant IDs missing from the output directory:")
    print("Participant ID: Cyted Sample ID")
    for pid in missing_participant_ids:
        print(pid, all_participant_ids[pid])
print("Done!")