#!/usr/bin/env python
import os
import sys
import argparse
import numpy as np
from skimage import io, exposure, transform, img_as_float
import pydicom
import re
import pathlib
import requests, zipfile
import pandas as pd
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import yaml
from anytree import AnyNode
from anytree.exporter import DictExporter
from anytree.importer import DictImporter
from pprint import pprint # just for nice printing
from anytree import RenderTree, search # just for nice printing
import remotedebugger as rd
parser = argparse.ArgumentParser(description='Util methods')
#rd.attachDebugger(parser)
parser.add_argument('-s', metavar='MR_ID_XNAT', type=str, nargs=1,
help='display DICOM info for all images of a study identified with MR ID XNAT')
parser.add_argument('-ps', metavar='MR_ID_XNAT', type=str, nargs=1,
help='display DICOM info for all images of all studies available for a patient that has a study identified with MR ID XNAT')
parser.add_argument('-p', metavar='PatientID', type=str, nargs=1,
help='display DICOM info for all images of all studies available for a patient identified with patientID')
parser.add_argument('-m', metavar='DICOM field modality',
help='summarize all values in dataset for the DICOM field modality')
parser.add_argument('-f', metavar='filename', type=str, nargs='?', default=True,
help='save all info for all studies in filename. Default filename: /all_info_studies.csv')
parser.add_argument('-d', metavar='filename', type=str, nargs='?', default=True,
help='describe all info for all studies. Default filename: /all_info_studies.csv')
parser.add_argument('-dc', metavar='categorical_field', type=str,
help='summarize DICOM field categories for all studies. Default filename: /all_info_studies.csv. Possible categorical fields are '
'all, Modality, SeriesDescription, ProtocolName, BodyPartExamined, ViewPosition, CodeMeaning, PhotometricInterpretation, Manufacturer')
parser.add_argument('-dn', metavar='numerical_field', type=str,
help='summarize a DICOM numerical field for all studies. Default filename: /all_info_studies.csv. Possible numerical fields are '
'PatientBirth, StudyDate, Rows, Columns, PixelAspectRatio, SpatialResolution, XRayTubeCurrent, ExposureTime, ExposureInuAs, Exposure, RelativeXRayExposure, BitsStored, PixelRepresentation, WindowCenter, WindowWidth')
parser.add_argument('-split',action='store_true',
help='split into projection view types based on manual review of DICOM info and a ResNet-50 classifier.')
parser.add_argument('-e',action='store_true',
help='generate the list of images to exclude: non-thorax X-ray studies, studies where the patient position is not vertical, and images without an associated report. Default source filename: /all_info_studies.csv.')
parser.add_argument('-imgs', metavar='MR_ID_XNAT', type=str, nargs=1,
help='preprocess all images of a study identified with MR ID XNAT')
parser.add_argument('-imgm', action='store_true',
help='generate mean X-Ray picture and mean standard deviation picture for each position')
parser.add_argument('-fst', metavar='filename', type=str, nargs='?', default=True,
help='save all info for all studies, by sentence topic, in filename. Default filename: /all_info_studies_sent_topics.csv')
parser.add_argument('-est', metavar='filename', type=str, nargs='?', default=True,
help='extract sentence topics and save them in filename. Default filename: /extract_sent_topics.csv')
parser.add_argument('-public', action='store_true', help='generate final File with all info to be public')
#parser.add_argument('u', metavar='username', type=str, nargs=1,
#help='XNAT account username')
#parser.add_argument('p', metavar='password', type=str, nargs=1,
#help='XNAT account password')
#all_info_studies_file = '/all_info_studies_nonXNAT.csv'
all_info_studies_file = '/all_info_studies.csv'
all_info_studies_st_file = 'all_info_studies_sent_topics.csv'
extract_topics_file = 'extract_sent_topics.csv'
args = parser.parse_args()
ID_XNAT = args.s[0] if args.s else None
patient_ID_XNAT = args.ps[0] if args.ps else None
patient_ID = args.p[0] if args.p else None
modality = args.m if args.m else None
# -f/-d (and -fst/-est below) use nargs='?' with default=True: flag absent -> True, bare flag -> None (use the default file), flag with value -> custom filename
filename = all_info_studies_file if args.f is None else (args.f if isinstance(args.f, str) else None)
filename_st = all_info_studies_st_file if args.fst is None else (args.fst if isinstance(args.fst, str) else None)
filename_to_describe = all_info_studies_file if args.d is None else (args.d if isinstance(args.d, str) else None)
categorical_field = args.dc if args.dc else None
numerical_field = args.dn if args.dn else None
solve_images_projection = args.split if args.split else None # NOTE: the def solve_images_projection() below rebinds this name at module level
exclude = args.e if args.e else None
imgs_ID_XNAT = args.imgs[0] if args.imgs else None
image_mean = args.imgm if args.imgm else None
extract_topics = extract_topics_file if args.est is None else (args.est if isinstance(args.est, str) else None)
save_public_file = args.public if args.public else None
#j_username = args.u[0] if args.u else ''
#j_password = args.p[0] if args.p else ''
currentroot = os.getcwd()
os.chdir("../")
root = os.getcwd()
os.chdir(currentroot)
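# Hedged usage sketch (assumes the data files referenced above exist and that the
# dispatch code later in this script acts on these flags, per the help strings):
#   python util.py -s 171456198269648269029527880939880883235  # DICOM info for one study
#   python util.py -dc Modality                                # summarize a categorical DICOM field
#   python util.py -dn Rows                                    # summarize a numerical DICOM field
#   python util.py -e                                          # generate the list of images to exclude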
#get dictionary of image paths and image modality (CT, RX, DX) for a given study identified with ID XNAT
def getImagesPath(ID_XNAT ):
path = root + '/SJ/image_dir/'
fieldValues = {} # create an empty field dictionary of values
walk_path = path if not ID_XNAT else path + ID_XNAT
for dirName, subdirList, fileList in os.walk(walk_path):
for filename in fileList:
if ".dcm" in filename.lower(): # check whether the file's DICOM
filename = os.path.join(dirName,filename)
e = np.fromfile(filename, dtype='>u2')
try:
RefDs = pydicom.read_file(filename)
value = RefDs.Modality
fieldValues[filename] = value
except:
pass
print(fieldValues)
print(fieldValues)
return fieldValues
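# Hedged example (study ID taken from the getAllInfo comment below; requires DICOM files under /SJ/image_dir):
#getImagesPath('171456198269648269029527880939880883235') # -> {'<path>.dcm': 'CR', ...}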
def getDicomFields(value):
pattern = re.compile(r'[\W_]+')
images_info = {}
images_info["PatientID"] = value.PatientName if 'PatientName' in value else None # codified patient's name. TODO: use it to ensure that the same patient is not in both the training and testing sets, to avoid overfitting
images_info["PatientBirth"] = str(value.PatientBirthDate)[:4] if 'PatientBirthDate' in value else None
images_info["PatientSex"] = value.PatientSex if 'PatientSex' in value else None
images_info['StudyDate'] = value.StudyDate if 'StudyDate' in value else None
images_info['Modality'] = pattern.sub('', value.Modality) if 'Modality' in value else None
images_info['SeriesDescription'] = pattern.sub('', value.SeriesDescription) if 'SeriesDescription' in value else None
images_info['ProtocolName'] = pattern.sub('', value.ProtocolName) if 'ProtocolName' in value else None
images_info['CodeMeaning'] = pattern.sub('', value.ProcedureCodeSequence[0].CodeMeaning) if 'ProcedureCodeSequence' in value and 'CodeMeaning' in value.ProcedureCodeSequence[0] else None
images_info['Manufacturer'] = pattern.sub('', value.Manufacturer) if 'Manufacturer' in value else None
images_info['ViewPosition'] = pattern.sub('', value.ViewPosition) if 'ViewPosition' in value else None
images_info['BodyPartExamined'] = pattern.sub('', value.BodyPartExamined) if 'BodyPartExamined' in value else None # after review, this field is inaccurate and inconsistent
images_info['Rows'] = value.Rows if 'Rows' in value else None
images_info['Columns'] = value.Columns if 'Columns' in value else None
images_info['PixelAspectRatio'] = value.PixelAspectRatio if 'PixelAspectRatio' in value else None
images_info['SpatialResolution'] = value.SpatialResolution if 'SpatialResolution' in value else None
images_info['PhotometricInterpretation'] = pattern.sub('', value.PhotometricInterpretation) if 'PhotometricInterpretation' in value else None
images_info['BitsStored'] = value.BitsStored if 'BitsStored' in value else None
images_info['PixelRepresentation'] = value.PixelRepresentation if 'PixelRepresentation' in value else None
images_info['WindowCenter'] = value.WindowCenter if 'WindowCenter' in value else None
images_info['WindowWidth'] = value.WindowWidth if 'WindowWidth' in value else None
images_info['RelativeXRayExposure'] = value.RelativeXRayExposure if 'RelativeXRayExposure' in value else None
images_info['ExposureTime'] = value.ExposureTime if 'ExposureTime' in value else None #Expressed in sec
images_info['XRayTubeCurrent'] = value.XRayTubeCurrent if 'XRayTubeCurrent' in value else None
images_info['ExposureInuAs'] = value.ExposureInuAs if 'ExposureInuAs' in value else None
images_info['Exposure'] = value.Exposure if 'Exposure' in value else None
return images_info
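# Hedged example (hypothetical path): extract the flat field dictionary from a single DICOM header.
#RefDs = pydicom.read_file(root + '/SJ/image_dir/<MR_ID_XNAT>/000001.dcm', force=True)
#getDicomFields(RefDs) # -> {'PatientID': ..., 'Modality': 'CR', 'Rows': ..., ...}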
def getDicomInfoStudy(ID_XNAT):
path = root + '/SJ'
walk_path = path + '/image_dir/' + ID_XNAT
images = {}
print(str(ID_XNAT))
for dirName, subdirList, fileList in os.walk(walk_path ):
for filename in fileList:
if ".dcm" in filename.lower(): # check whether the file's DICOM
try:
RefDs = pydicom.read_file(os.path.join(dirName,filename),force=True)
images['/image_dir_processed/' + ID_XNAT + '_' + filename[-10:-4] + '.png'] = RefDs
except:
print('Failed opening DICOM file')
pass
return images
# Get report + images + DICOM info for a given study identified by MR ID XNAT
# Precondition: images need to have been previously preprocessed and stored in image_dir
# Arguments: the study identified by MR ID XNAT, e.g. 171456198269648269029527880939880883235
# Return: the list of image dictionaries that belong to the study. Each dict contains patient and image information fields
def getAllInfo(ID_XNAT, dataset_asoc_DF = None, report_DF = None):
study_info = {}
study_info['codigoinforme'] = None
#access report
try:
#access number of report
dataset_asoc = '/dataset_asoc_abril18.csv'
path = root + '/Rx-thorax-automatic-captioning' + dataset_asoc
dataset_asoc_DF = pd.read_csv(path, sep = ';', ) if dataset_asoc_DF is None else dataset_asoc_DF
exp_id = dataset_asoc_DF.loc[dataset_asoc_DF['MR ID XNAT'] == ID_XNAT]
exp_id = int(exp_id.iloc[0]['Access Number'])
study_info['codigoinforme'] = int(str(exp_id)[-7:])
#access report
filename = '/report_sentences_preprocessed.csv'
path = root + '/Rx-thorax-automatic-captioning' + filename
report_DF = pd.read_csv(path, sep = ',', encoding='ISO-8859-1') if report_DF is None else report_DF
text = report_DF.loc[report_DF['codigoinforme'] == study_info['codigoinforme']]
study_info['report'] = text.iloc[0]['v_preprocessed']
except:
study_info['report'] = None
print("MISSING REPORT: " +str(ID_XNAT) + ' ' + str(study_info['codigoinforme']))
pass
print(study_info)
#access study's images
images =[]
dicoms = getDicomInfoStudy(ID_XNAT)
for key, value in dicoms.items():
print(value)
images_info = getDicomFields(value)
images_info['ReportID'] = study_info['codigoinforme']
images_info['Report'] = study_info['report']
images_info["ImagePath"] = key #TODO: image path should be the preprocessed image path not the dcm path
images_info["StudyID"] = ID_XNAT #A study has 1 to Many images
images.append(images_info)
return images
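# Hedged example (study ID from the comment above; requires the association and report CSVs):
#getAllInfo('171456198269648269029527880939880883235') # -> [{'Modality': ..., 'ReportID': ..., 'Report': ...}, ...]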
def saveAllStudyInfoFullDataset(save_file = all_info_studies_file, dataset_asoc_file = '/dataset_asoc_10042018.csv'):
dataset_asoc = '/dataset_asoc_10042018.csv' if not dataset_asoc_file else dataset_asoc_file
path = root + '/Rx-thorax-automatic-captioning' + dataset_asoc
dataset_asoc_DF = pd.read_csv(path, sep = ',', )
reports = '/report_sentences_preprocessed.csv'
path = root + '/Rx-thorax-automatic-captioning' + reports
report_DF = pd.read_csv(path, sep = ',', encoding='ISO-8859-1')
all_studies_DF = None
empty = False
path = root + '/Rx-thorax-automatic-captioning' + save_file
if os.path.exists(path):
all_studies_DF = pd.read_csv(path, sep = ';' , header = 0)
else:
empty = True
f = open(path, 'a')
for index, r in dataset_asoc_DF.iterrows():
studyID = r['MR ID XNAT']
if all_studies_DF is None or not all_studies_DF['StudyID'].str.contains(studyID).any():
imagesInfo = getAllInfo(studyID,dataset_asoc_DF, report_DF)
for img in imagesInfo:
if empty:
f.write(";".join(img.keys()) + '\n')
empty = False
f.writelines(";".join(str(e) for e in img.values()) + '\n')
f.close()
def saveAllStudyInfoNonXNATDataset(save_file = '/all_info_studies_nonXNAT.csv', dataset_asoc_file = '/pacientes_bloque_nuevo_220518.txt'):
dataset_asoc = dataset_asoc_file if dataset_asoc_file is not None else '/pacientes_bloque_nuevo_220518.txt' #Columns: "Access Number Study instance UID Id Anonimized"
all_studies_DF = None
empty = False
path = root + '/Rx-thorax-automatic-captioning' + save_file
if os.path.exists(path):
all_studies_DF = pd.read_csv(path, sep = ';' , header = 0)
else:
empty = True
f = open(path, 'a')
#Access each .dcm in the nested folders
path = root + '/SJ'
walk_path = path + '/salinas/'
for dirName, subdirList, fileList in os.walk(walk_path ):
for filename in fileList:
if ".dcm" in filename.lower(): # check whether the file's DICOM
DICOMImagePath = dirName + "/" + filename
if all_studies_DF is None or (DICOMImagePath not in all_studies_DF["DICOMImagePath"].values):
try:
RefDs = pydicom.read_file(os.path.join(dirName,filename),force=True)
image_info = getDicomFields(RefDs)
P_ID = RefDs.PatientID if 'PatientID' in RefDs else None # the new image dataset has no XNAT ID available, so the DICOM PatientID is used instead; do not confuse it with the PatientName value
ID_XNAT = RefDs.StudyInstanceUID if 'StudyInstanceUID' in RefDs else None
ID_XNAT = ID_XNAT.replace('.','')
image_info["StudyID"] = ID_XNAT #A xnat study has 1 to Many images,
#access number of report from DICOM
path = root + '/Rx-thorax-automatic-captioning' + dataset_asoc
if ('RequestAttributesSequence' in RefDs and 'RequestedProcedureID' in RefDs.RequestAttributesSequence[0] ):
report_id = RefDs.RequestAttributesSequence[0].RequestedProcedureID
image_info['ReportID'] = int(str(report_id)[-7:])
else: #If not in DICOM, try to retrieve it from association file
dataset_asoc_DF = pd.read_csv(path, sep = '\t', )
exp_id = dataset_asoc_DF.loc[dataset_asoc_DF['Study instance UID'] == ID_XNAT]
exp_id = int(exp_id.iloc[0]['Access Number'])
image_info['ReportID'] = int(str(exp_id)[-7:])
try:
#access report
filename = '/report_sentences_preprocessed.csv'
path = root + '/Rx-thorax-automatic-captioning' + filename
report_DF = pd.read_csv(path, sep = ',', encoding='ISO-8859-1')
text = report_DF.loc[report_DF['codigoinforme'] == image_info['ReportID']]
image_info['Report'] = text.iloc[0]['v_preprocessed']
except:
image_info['Report'] = None
print("MISSING REPORT: " +str(ID_XNAT) + ' ' + str(image_info['ReportID']))
pass
suffix_image_id = DICOMImagePath.replace('/','-')
image_info["ImagePath"] = '/image_dir_processed/' + str(ID_XNAT) + '_' + suffix_image_id[-14:-4] + '.png'
image_info["DICOMImagePath"] = DICOMImagePath
if empty:
f.write(";".join(image_info.keys()) + '\n')
empty = False
f.writelines(";".join(str(e) for e in image_info.values()) + '\n')
except:
print('Failed opening DICOM file: ' + DICOMImagePath, sys.exc_info())
#raise
pass
#else:
#print(DICOMImagePath + "skipped, already in file")
f.close()
return
def summarizeAllStudiesDicomModality():
path = root + '/SJ/image_dir/'
fieldValues = {} # create an empty field dictionary of values
samples = {}
for dirName, subdirList, fileList in os.walk(path):
for filename in fileList:
if ".dcm" in filename.lower(): # check whether the file's DICOM
filename = os.path.join(dirName,filename)
e = np.fromfile(filename, dtype='>u2')
try:
RefDs = pydicom.read_file(filename)
value = RefDs.Modality
if value in fieldValues.keys():
fieldValues[value] += 1
else:
fieldValues[value] = 1
samples[value] = filename
except:
pass
print(fieldValues)
print(samples)
print(fieldValues)
print(samples)
return fieldValues, samples
def generatePositionsFileToReview():
dfa = pd.read_csv('all_info_studies_labels_160K.csv', header = 0)
dfs = pd.read_csv('SJ_chest_x_ray_images_labels_160K.csv', header = 0)
dfs['ImageID'] = dfs['ImageID'].astype(str)
dfa['ImageID'] = dfa['ImagePath'].str.split("/").str.get(-1)
dfa['ImageID'] = dfa['ImageID'].astype(str)
m = pd.merge(dfa,dfs,how='left', on='ImageID')
m = m[['Projection','ViewPosition','CodeMeaning','ProtocolName','SeriesDescription', 'ImageID']].astype(str)
c = m.groupby(['Projection','ViewPosition','ProtocolName','SeriesDescription','CodeMeaning']).ImageID.count()
c = pd.DataFrame(c)
c.to_csv('Positions.csv')
return
def solve_images_projection():
dfa = pd.read_csv('all_info_studies_labels_160K_Ene19.csv', header = 0).astype(str)
print(dfa.shape)
#projections are resolved manually based on the unstructured text in ['ViewPosition','CodeMeaning','ProtocolName','SeriesDescription']
#those not resolvable from these fields are marked as "UNK"
#merge files
pr = pd.read_csv('Positions_Reviewed.csv', header = 0).astype(str)
m = pd.merge(dfa, pr, how='left', on=['ViewPosition','CodeMeaning','ProtocolName','SeriesDescription'])
m.drop_duplicates(subset = 'ImagePath', inplace=True)
dfa = pd.merge(dfa, m[['ImagePath','Review', 'Pediatric']], on='ImagePath', how='left')
#UNK projections are then resolved into PA and L by a pretrained ResNet-50 model
#merge files: Review_x contains manually reviewed projections, Review_y contains projections classified by the ResNet
dfu = pd.read_csv('../chestViewSplit/all_info_studies_labels_projections_160K.csv', header = 0).astype(str)
dfa = pd.merge(dfa, dfu[['ImagePath','Review']], on='ImagePath', how='left')
dfa['MethodProjection'] = 'Manual review of DICOM fields'
dfa.loc[dfa['Review_x'] == 'UNK','MethodProjection'] = 'resnet-50.t7'
dfa.to_csv('all_info_studies_labels_projections_160K_Ene19.csv')
return
def summarizeAllStudiesByCategoricalField (file = all_info_studies_file, categorical_field = None):
#Return a dataframe where each row is one class, with the number of occurrences and one example row
#Possible categorical fields: 'Modality;SeriesDescription;ProtocolName;ViewPosition;PhotometricInterpretation'
#all_studies_DF = pd.read_csv(root + '/Rx-thorax-automatic-captioning' + file, sep = ';' , header = 0)
all_info_studies_file_prefix = "/all_info_studies"
all_studies_DF = pd.read_csv(root + '/Rx-thorax-automatic-captioning' + all_info_studies_file_prefix + '_nonXNAT.csv', sep = ';' , header = 0)
all_studies_DF = pd.concat([all_studies_DF,pd.read_csv(root + '/Rx-thorax-automatic-captioning' + all_info_studies_file_prefix + '.csv', sep = ';' , header = 0)], sort=False)
all_studies_DF = all_studies_DF.drop_duplicates()
n = all_studies_DF.groupby(categorical_field).first()
c = all_studies_DF[categorical_field].value_counts()
n = n.join(c)
return n
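# Hedged example call (commented out, matching this file's convention; see the numerical variant below):
#summarizeAllStudiesByCategoricalField(categorical_field='Modality')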
def summarizeAllStudiesByNumericalField (file = all_info_studies_file, numerical_field = None):
#Return summary statistics (describe) for the given numerical field across all studies
#all_studies_DF = pd.read_csv(root + '/Rx-thorax-automatic-captioning' + file, sep = ';' , header = 0)
all_info_studies_file_prefix = "/all_info_studies"
all_studies_DF = pd.read_csv(root + '/Rx-thorax-automatic-captioning' + all_info_studies_file_prefix + '_nonXNAT.csv', sep = ';' , header = 0)
all_studies_DF = pd.concat([all_studies_DF,pd.read_csv(root + '/Rx-thorax-automatic-captioning' + all_info_studies_file_prefix + '.csv', sep = ';' , header = 0)], sort=False)
all_studies_DF = all_studies_DF.drop_duplicates()
#exclude non-evaluable images
excluded = pd.read_csv('Excluded_images_nonXNAT.csv', sep = ',' , header = 0)
excluded = pd.concat([excluded, pd.read_csv('Excluded_images.csv', sep = ',' , header = 0)],sort = False)
print("Number of excluded studies: " + str(excluded.StudyID.nunique()))
all_studies_DF = all_studies_DF[~all_studies_DF['ImagePath'].isin(excluded['ImagePath'].values)]
idx = all_studies_DF[all_studies_DF['ImagePath'].isin(excluded['ImagePath'].values)].index.values
#all_studies_DF = all_studies_DF.drop(idx)
print("Number of excluded images: " + str(excluded.ImagePath.nunique()))
n = all_studies_DF[numerical_field]
n = pd.to_numeric(n, errors='coerce')
return n.describe()
#summarizeAllStudiesByNumericalField(numerical_field= 'Rows')
def summarizeAllStudies(file = all_info_studies_file):
summary = {}
all_info_studies_file_prefix = "/all_info_studies"
all_studies_DF = pd.read_csv(root + '/Rx-thorax-automatic-captioning' + all_info_studies_file_prefix + '_nonXNAT.csv', sep = ';' , header = 0)
all_studies_DF = pd.concat([all_studies_DF,pd.read_csv(root + '/Rx-thorax-automatic-captioning' + all_info_studies_file_prefix + '.csv', sep = ';' , header = 0)], sort=False)
all_studies_DF = all_studies_DF.drop_duplicates()
print("Number of total studies: " + str(all_studies_DF.StudyID.nunique()))
print("Number of total images: " + str(all_studies_DF.ImagePath.nunique()))
print("Number of total patients: " + str(all_studies_DF.PatientID.nunique()))
#exclude non-evaluable images
excluded = pd.read_csv('Excluded_images_nonXNAT.csv', sep = ',' , header = 0)
excluded = pd.concat([excluded, pd.read_csv('Excluded_images.csv', sep = ',' , header = 0)],sort = False)
print("Number of excluded studies: " + str(excluded.StudyID.nunique()))
all_studies_DF = all_studies_DF[~all_studies_DF['ImagePath'].isin(excluded['ImagePath'].values)]
idx = all_studies_DF[all_studies_DF['ImagePath'].isin(excluded['ImagePath'].values)].index.values
#all_studies_DF = all_studies_DF.drop(idx)
print("Number of excluded images: " + str(excluded.ImagePath.nunique()))
print("Number of excluded patients: " + str(excluded.PatientID.nunique()))
print(excluded.groupby('ImagePath').first().ReasonToExclude.value_counts())
#Number of different images
n = all_studies_DF['ImagePath'].count()
print("Number of curated images: " + str(n))
#Number of different patients
n_patients = all_studies_DF.groupby(['PatientID']).count() # not named 'np', which would shadow numpy
print("Number of patients: " + str(len(n_patients)))
#ax = serie.hist()
#fig = ax.get_figure()
#fig.savefig('PatientNumberDistribution.pdf')
#import matplotlib.pyplot as plt
#s.hist()
#plt.savefig('path/to/figure.pdf') # saves the current figure
#Number of studies per patient (mean, min, max)
ns = all_studies_DF.groupby(['StudyID']).count()
print("Number of curated studies: " + str(len(ns)))
#Number of images per study (mean, min, max)
print("Distribution of # of images per study (mean, min, max): ")
ns = all_studies_DF.groupby(['StudyID'])['ImagePath'].nunique()
print(ns.describe())
ns = all_studies_DF.groupby(['PatientID'])['StudyID'].nunique()
print("Distribution of # of studies per patient (mean, min, max): ")
print(ns.describe())
print("Distribution of # of images per patient (mean, min, max): ")
print(all_studies_DF.groupby(['PatientID'])['ImagePath'].nunique().describe())
patients_with_multiple_studies= ns[ns > 1]
patients_with_multiple_studies.hist()
plt.title("Histogram of Patients with Multiple Studies (n = " + str(patients_with_multiple_studies.count()) + ")" )
plt.xlabel("Number of Studies")
plt.ylabel("Number of Patients")
plt.savefig('graphs/StudiesPerPatientHistogram_all.png')
plt.gcf().clear()
patients_with_multiple_studies.to_csv('Patients_with_multiple_studies_all.csv')
#Birth year (mean, min, max), (distribution by year)
print ('Birth Year Distribution: ')
years = all_studies_DF['PatientBirth']
years = pd.to_numeric(years, errors='coerce')
years[years > 2017] = np.nan # dataset runs up to 2017; some birth years are wrong (e.g. the impossible year 2052)
print (years.describe())
years.hist()
plt.title("Birth Year Histogram (n = " + str(n) + ")" )
plt.xlabel("Patient's Birth Year")
plt.ylabel("Images")
plt.savefig('graphs/BirthYearHistogram_all.png') # saves the current figure
plt.gcf().clear()
#Age year (mean, min, max), (distribution by year)
print ('Age Distribution: ')
study_dates = all_studies_DF['StudyDate'].apply(lambda x: str(x)[:4])
ages = pd.to_numeric(study_dates, errors='coerce' ) - years
print (ages.describe())
#ages.hist()
freq = pd.Series(ages).value_counts().sort_index()
freq.plot(kind='bar', color='blue', grid=False)
plt.title("Age Histogram (n = " + str(n) + ")" )
plt.xlabel("Patient's Age")
plt.xticks([0,10,20,30,40,50,60,70,80,90,100], [0,10,20,30,40,50,60,70,80,90,100])
plt.ylabel("Images")
plt.savefig('graphs/AgeHistogram_all.png')
plt.gcf().clear()
#Sex
print("Gender Histogram (n = " + str(n) + ")")
sex = all_studies_DF['PatientSex'].dropna()
print(sex.value_counts())
sex = sex.where(sex.isin(['F','M']))
sex.value_counts().plot(kind='bar')
plt.title("Gender Histogram (n = " + str(n) + ")" )
plt.xlabel("Gender")
plt.ylabel("Images")
plt.savefig('graphs/GenderHistogram_all.png')
plt.gcf().clear()
#Sex by age
print("Age by Gender Histogram (n = " + str(n) + ")")
ages_sex = all_studies_DF.join(pd.DataFrame({'PatientAge' :ages}))
ages_sex = ages_sex[ages_sex["PatientSex"].isin(['F','M'])]
table_ages_sex = pd.crosstab(index=ages_sex["PatientAge"], columns=ages_sex["PatientSex"])
table_ages_sex.plot(kind='bar', stacked=False, color=['red','blue'], grid=False)
plt.title("Ages by Gender Histogram (n = " + str(n) + ")" )
plt.xlabel("Ages")
plt.xticks([0,10,20,30,40,50,60,70,80,90,100], [0,10,20,30,40,50,60,70,80,90,100])
plt.ylabel("Images")
plt.savefig('graphs/AgesByGenderHistogram_all.png')
plt.gcf().clear()
pivot = ages_sex.reset_index().pivot(columns='PatientSex', values='PatientAge')
pivot.boxplot(showfliers = True)
plt.title("Gender Ages Boxplot (n = " + str(n) + ")" )
plt.xlabel("Gender")
plt.ylabel("Patient Age")
plt.savefig('graphs/AgesByGenderBoxplot_all.png')
plt.gcf().clear()
#Study date (distribution by year)
print('Study Date Distribution')
study_dates = pd.to_numeric(all_studies_DF['StudyDate'], errors='coerce' ).dropna()
study_dates = study_dates.apply(lambda x: str(x)[:4] + '-' + str(x)[4:6] + '-' + str(x)[6:8]).astype('datetime64[ns]')
years = pd.DatetimeIndex(study_dates).year.values
pd.Series(years).hist()
plt.title("Study Year Histogram (n = " + str(n) + ")" )
plt.xlabel("Year of Study")
plt.ylabel("Images")
plt.savefig('graphs/StudyYearHistogram_all.png')
plt.gcf().clear()
months = pd.DatetimeIndex(study_dates).month.values
month_names = ['JAN', 'FEB', 'MAR', 'APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC']
freq = pd.Series(months).value_counts().sort_index()
freq.plot(kind='bar')
plt.title("Study Month Histogram (n = " + str(n) + ")" )
plt.xlabel("Month of Study")
plt.xticks([0,1,2,3,4,5,6,7,8,9,10,11], month_names)
plt.ylabel("Images")
plt.savefig('graphs/StudyMonthHistogram_all.png')
plt.gcf().clear()
#Study Type distribution (portable ['RXSIMPLETORAXPORTATIL1OMASPROYECCIONES','RXTORAXCONPORTATIL'], pediatric ['TORAXPEDIATRICO','RXTORAXPAYLATPED'])
print("Study Type Histogram (n = " + str(n) + ")")
typeS = all_studies_DF['CodeMeaning']
typeS.value_counts().plot(kind='bar')
plt.title("Study Type Histogram (n = " + str(n) + ")" )
plt.xlabel("Study Type")
plt.ylabel("Images")
plt.savefig('graphs/TypeHistogram_all.png')
plt.gcf().clear()
#Radiation exposure levels by study year
print("Radiation exposure levels by study year")
temp = pd.DataFrame()
temp['Year'] = pd.to_numeric(all_studies_DF['StudyDate'], errors='coerce' )
temp['Exposure'] = pd.to_numeric(all_studies_DF['Exposure'], errors='coerce', downcast='integer' )
print(temp['Exposure'].describe())
temp= temp.dropna()
temp = temp[temp['Exposure'] > 0]
temp['Year'] = temp['Year'].apply(lambda x: str(x)[:4] + '-' + str(x)[4:6] + '-' + str(x)[6:8]).astype('datetime64[ns]')
temp['Year'] = pd.DatetimeIndex(temp['Year']).year.values
pivot = temp.reset_index().pivot(columns='Year', values='Exposure')
pivot.boxplot()
plt.title("Radiation Exposure by Study Year (n = " + str(n) + ")" )
plt.xlabel("Years")
plt.ylabel("Radiation Exposure (mAs)")
plt.savefig('graphs/RadiationExposureByYear_all.png')
plt.gcf().clear()
pivot.boxplot(showfliers=False)
plt.title("Radiation Exposure by Study Year (n = " + str(n) + ")" )
plt.xlabel("Years")
plt.ylabel("Radiation Exposure (mAs)")
plt.savefig('graphs/RadiationExposureByYear_NotOutliers_all.png')
plt.gcf().clear()
#Radiation exposure levels by type of Rx
p_cost = pd.read_csv('position_costal_images_nonXNAT.csv')
p_cost = pd.concat([p_cost,pd.read_csv('position_costal_images.csv')],sort=False)
p_cost['Position'] = 'COSTAL'
p_AP_horizontal = pd.read_csv('position_frontAPHorizontal_images_nonXNAT.csv')
p_AP_horizontal = pd.concat([p_AP_horizontal,pd.read_csv('position_frontAPHorizontal_images.csv')],sort=False)
p_AP_horizontal['Position'] = 'AP_horizontal'
p_frontAP = pd.read_csv('position_frontAP_images_nonXNAT.csv')
p_frontAP = pd.concat([p_frontAP,pd.read_csv('position_frontAP_images.csv')],sort=False)
p_frontAP['Position'] = 'AP'
p_frontPA = pd.read_csv('position_frontPA_images_nonXNAT.csv')
p_frontPA = pd.concat([p_frontPA,pd.read_csv('position_frontPA_images.csv')],sort=False)
p_frontPA['Position'] = 'PA'
p_pediatric = pd.read_csv('position_pediatric_images_nonXNAT.csv')
p_pediatric = pd.concat([p_pediatric,pd.read_csv('position_pediatric_images.csv')],sort=False)
p_pediatric['Position'] = 'PED'
p_side_ver_lef = pd.read_csv('position_side_ver_lef_images_nonXNAT.csv')
p_side_ver_lef = pd.concat([p_side_ver_lef,pd.read_csv('position_side_ver_lef_images.csv')],sort=False)
p_side_ver_lef['Position'] = 'L'
p_side_ver_rig = pd.read_csv('position_side_ver_rig_images_nonXNAT.csv')
p_side_ver_rig = pd.concat([p_side_ver_rig,pd.read_csv('position_side_ver_rig_images.csv')],sort=False)
#right lateral positions are not identifiable by any field, so they are aggregated into a single group with the left-sided x-rays
p_side_ver_rig['Position'] = 'L'
p = [p_cost, p_AP_horizontal, p_frontAP,p_side_ver_rig, p_side_ver_lef, p_pediatric,p_frontPA ]
position = pd.concat(p)
print(position.shape)
print(position.columns)
print("Position Views")
print(position.Position.describe())
n_by_position = position.groupby('Position').apply(lambda x: x.count())
print(n_by_position)
position['Exposure'] = pd.to_numeric(position['Exposure'], errors='coerce', downcast='integer' )
pivot = position.reset_index().pivot(columns='Position', values='Exposure')
pivot.boxplot()
plt.title("Radiation Exposure by Position View")
plt.xlabel("Position View")
plt.ylabel("Radiation Exposure (mAs)")
plt.savefig('graphs/RadiationExposureByPositionView_all.png')
plt.gcf().clear()
pivot = position.reset_index().pivot(columns='Position', values='Exposure')
pivot.boxplot(showfliers = False)
plt.title("Radiation Exposure by Position View")
plt.xlabel("Position View")
plt.ylabel("Radiation Exposure (mAs)")
plt.savefig('graphs/RadiationExposureByPositionViewWithoutOutlayer_all.png')
plt.gcf().clear()
position = pd.concat([position, pd.read_csv('position_toreview_images_nonXNAT.csv'),pd.read_csv('position_toreview_images.csv')], axis = 0)
position.loc[position['Position'].isna(), 'Position'] = 'unk'
position.to_csv('position_view_all.csv', columns = ['Position','ImagePath'])
print(position.Position.describe())
position.Position.value_counts().plot(kind='bar')
plt.title("Position View Histogram (n = " + str(n) + ")" )
plt.xlabel("Position View")
plt.ylabel("Images")
plt.savefig('graphs/PositionViewHistogram_all.png')
plt.gcf().clear()
#RadiationExposureByModality (Digital Rx - DX vs Computer Rx - CR)
print("Radiation exposure levels by modality: Digitalized Rx (DX) vs Computer Rx (CR)")
temp = pd.DataFrame()
temp['Modality'] = all_studies_DF['Modality']
temp['Exposure'] = pd.to_numeric(all_studies_DF['Exposure'], errors='coerce', downcast='integer' )
print(temp['Exposure'].describe())
temp= temp.dropna()
temp = temp[temp['Exposure'] > 0]
pivot = temp.reset_index().pivot(columns='Modality', values='Exposure')
pivot.boxplot()
plt.title("Radiation Exposure by Modality (n = " + str(n) + ")" )
plt.xlabel("Modality")
plt.ylabel("Radiation Exposure (mAs)")
plt.savefig('graphs/RadiationExposureByModality_all.png')
plt.gcf().clear()
pivot.boxplot(showfliers=False)
plt.title("Radiation Exposure by Modality (n = " + str(n) + ")" )
plt.xlabel("Years")
plt.ylabel("Radiation Exposure (mAs)")
plt.savefig('graphs/RadiationExposureByModality_NotOutliers_all.png')
plt.gcf().clear()
#Dynamic Range Distribution by type of Rx (Lateral vs AP)
return summary
#summarizeAllStudies()
def generateMeanAndStdImage(d, position,n_sample):
mean = np.full((400, 400), 0)
for path in d:
path = root + '/SJ' + path
try:
img = img_as_float(io.imread(path))
img = transform.resize(img, (400,400))
#img = np.expand_dims(img, -1)
img = np.uint8(img * 255)
#concatenated_img = np.concatenate((concatenated_img,img), axis=2)
mean = img + mean
except:
print(path)
pass
mean = mean / n_sample
img = exposure.equalize_hist(np.uint8(mean))
io.imsave('graphs/_' + position + '_MeanImage.png',img)
#calculate std
std = np.full((400, 400), 0)
for path in d:
path = root + '/SJ' + path
try:
img = img_as_float(io.imread(path))
img = transform.resize(img, (400,400))
#img = np.expand_dims(img, -1)
img = np.uint16(img * 255)
#concatenated_img = np.concatenate((concatenated_img,img), axis=2)
std = np.power((img - mean),2) + std
except:
print(path)
pass
std = np.sqrt(std/n_sample)
img = exposure.equalize_hist(std)
io.imsave('graphs/_' + position + '_StdImage.png',img)
return
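# A minimal single-pass alternative (a sketch, not called anywhere): Welford's
# online algorithm produces the same mean/std images in one pass over the files
# instead of the two passes used by generateMeanAndStdImage above.
def generateMeanAndStdImageWelford(d, position):
    count = 0
    mean = np.zeros((400, 400))
    m2 = np.zeros((400, 400))  # running sum of squared deviations
    for path in d:
        path = root + '/SJ' + path
        try:
            img = img_as_float(io.imread(path))
            img = transform.resize(img, (400, 400)) * 255
        except Exception:
            print(path)
            continue
        count += 1
        delta = img - mean
        mean += delta / count
        m2 += delta * (img - mean)  # second factor uses the updated mean, per Welford
    if count:
        io.imsave('graphs/_' + position + '_MeanImage.png', exposure.equalize_hist(np.uint8(mean)))
        io.imsave('graphs/_' + position + '_StdImage.png', exposure.equalize_hist(np.sqrt(m2 / count)))
    return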
def generateMeanAndStdImageByEachPosition(n_sample = 500):
df = pd.read_csv('all_info_studies_labels_160K.csv', header = 0)
print(df.Review.value_counts())
for position in (set(df.Review.value_counts().index.values) - set(['EXCLUDE'])):
d = df[(df.Review == position) & (df.Pediatric != 'PED' )]
d = d.sample(n = n_sample,random_state=3)
d = d.ImagePath.values
generateMeanAndStdImage(d,position, n_sample)
#pediatric mean and std image
print(df.Pediatric.value_counts())
d = df[(df.Review != 'L') & (df.Pediatric == 'PED' )]
d = d.ImagePath.values
generateMeanAndStdImage(d,'PED', d.shape[0])
def getAllInfoPatient(patient_ID_XNAT= False , patient_ID=False):
pd.set_option('display.max_colwidth', None) # show full column contents
all_studies_DF = pd.read_csv(root + '/Rx-thorax-automatic-captioning' + all_info_studies_file , sep = ';' , header = 0)
rows = None
if patient_ID_XNAT:
rows = all_studies_DF.loc[all_studies_DF['StudyID'] == patient_ID_XNAT]
patient_ID = rows.iloc[0]['PatientID']
studies = all_studies_DF.loc[all_studies_DF['PatientID'] == patient_ID]
#studies.loc[:,'StudyDate'] = studies.loc[:,'StudyDate'].apply(pd.to_numeric) # FIX: ordering not working
#studies.sort_values(by='StudyDate', ascending=True)
return studies
def generateExcludedImageList(fileSuffix = ".csv"): #fileSuffix is used to choose both the source and save file names.
#possible values: _nonXNAT.csv, .csv
all_info_studies_file_prefix = "/all_info_studies"
all_info_studies_file = all_info_studies_file_prefix + fileSuffix
all_studies_DF = pd.read_csv(root + '/Rx-thorax-automatic-captioning' + all_info_studies_file , sep = ';' , header = 0)
#By Modality: exclude CT or None
exclude = all_studies_DF[all_studies_DF['Modality'].isin(['None', 'CT'])]
exclude.loc[:,'ReasonToExclude'] = 'CT'
#By BodyPartExamined ['ABDOMEN', 'LOWEXM', 'RXESQUELETOPER', 'SERIEOSEA']
images = all_studies_DF[all_studies_DF['BodyPartExamined'].isin(['ABDOMEN', 'LOWEXM', 'RXESQUELETOPER','SERIEOSEA'])]
images.loc[:,'ReasonToExclude'] = 'BodyPartExamined'
exclude= exclude.append(images)
#By Series Description: exclude
series_description = ['LatHorizontal','None', 'Oblicuo', 'Transtorácico']
images = all_studies_DF[all_studies_DF['SeriesDescription'].isin(series_description)]
images.loc[:,'ReasonToExclude'] = 'SeriesDescription'
exclude= exclude.append(images)
#By Protocol Name: exclude
protocol_name = ['CHEST1VIE','None','TORAXABDOMENThorax','CHEST1VIECHEST1VIEABDOMEN2VIES', 'Esternón', 'Húmero', 'Sternón']
images = all_studies_DF[all_studies_DF['ProtocolName'].isin(protocol_name)]
images.loc[:,'ReasonToExclude'] = 'ProtocolName'
exclude= exclude.append(images)
#By Photometric Interpretation: exclude MONOCHROME1. Reason: many of these values are DICOM annotation errors, so the images that actually need inversion cannot be reliably identified during preprocessing
photometric_interpretation = ['MONOCHROME1']
images = all_studies_DF[all_studies_DF['PhotometricInterpretation'].isin(photometric_interpretation)]
images.loc[:,'ReasonToExclude'] = 'MONOCHROME1'
exclude= exclude.append(images)
#Images without reports
print("excluding images without reports..")
images = all_studies_DF[all_studies_DF['Report'] == 'None']
images.loc[:,'ReasonToExclude'] = 'NoReport'
exclude= exclude.append(images)
print(exclude.shape)
nr = '\n'.join(images.groupby('StudyID').groups.keys())
f = open("Studies_without_reports" + fileSuffix, 'w')
f.write(nr)
f.close()
#Images that failed to be preprocessed and are not in image_dir_processed
#i = all_studies_DF.ImagePath.apply(lambda x : not os.path.exists(root + '/SJ' + x))
print("excluding non processed images..")
image_dir_processed = pathlib.Path(root + '/SJ'+ '/image_dir_processed')
savedImagePaths = list(image_dir_processed.glob("*.png"))
images = all_studies_DF[~all_studies_DF['ImagePath'].isin([ '/image_dir_processed/'+ it.name for it in savedImagePaths])]
#images = all_studies_DF[ all_studies_DF.ImagePath.apply(lambda x : not os.path.exists(root + '/SJ' + x))]
images.to_csv("Not_processed_images" + fileSuffix)
images.loc[:,'ReasonToExclude'] = 'ImageNotProcessed'
exclude= exclude.append(images)
#Images with less than expected number of rows or columns
print("excluding images with less than expected number of rows or columns..")
columns = pd.to_numeric(all_studies_DF.Columns, errors='coerce')
rows = pd.to_numeric(all_studies_DF.Rows, errors='coerce')
images = all_studies_DF[(columns < 500) | (rows < 500)]
print(images.shape[0])
images.loc[:, 'ReasonToExclude'] = 'LowRowsOrColumns'
exclude= exclude.append(images)
exclude.to_csv("Excluded_images_redundant" + fileSuffix)
exclude.groupby('ImagePath').first().to_csv("Excluded_images" + fileSuffix)
return exclude
def preprocess_images( study_ids = None):
path = root + '/SJ'
ConstPixelDims = None
ConstPixelSpacing = None
for i, study_id in enumerate(study_ids):
images = getDicomInfoStudy(study_id)
for image in images.items():
RefDs = image[1]
filename = image[0]
try:
# Load dimensions based on the number of rows, columns
ConstPixelDims = (int(RefDs.Rows), int(RefDs.Columns))
# Load spacing values (in mm)
if hasattr(RefDs, 'PixelSpacing'):
ConstPixelSpacing = (float(RefDs.PixelSpacing[0]), float(RefDs.PixelSpacing[1]))
x = np.arange(0.0, (ConstPixelDims[0]+1)*ConstPixelSpacing[0], ConstPixelSpacing[0])
y = np.arange(0.0, (ConstPixelDims[1]+1)*ConstPixelSpacing[1], ConstPixelSpacing[1])
# The array is sized based on 'ConstPixelDims'
ArrayDicom = np.zeros(ConstPixelDims, dtype=RefDs.pixel_array.dtype)
# store the raw image data
ArrayDicom[:, :] = RefDs.pixel_array
#img = 1.0 - ArrayDicom * 1. / 4096
#img = exposure.equalize_hist(img)
#io.imsave(path + '/image_dir_processed/inverted_' + filename[-10:-4] + '.png', img)
#Testing
vmin = ArrayDicom.min() # renamed from 'min'/'max' to avoid shadowing the builtins
vmax = ArrayDicom.max()
print ("ArrayDicom Mean: " + str(ArrayDicom.mean()))
print ("Window Center: " + str(RefDs.WindowCenter))
print ("ArrayDicom Min: " + str(vmin))
print ("ArrayDicom Max: " + str(vmax))
print ("Window Width: " + str(RefDs.WindowWidth))
cp = np.copy(ArrayDicom)
nmin = RefDs.WindowCenter - 0.5 - (RefDs.WindowWidth - 1)/2
nmax = RefDs.WindowCenter - 0.5 + (RefDs.WindowWidth - 1)/2
cp[ArrayDicom <= nmin] = vmin
cp[ArrayDicom > nmax] = vmax
temp = ((ArrayDicom - (RefDs.WindowCenter - 0.5)) / (RefDs.WindowWidth - 1) + 0.5) * (vmax - vmin) + vmin
cp[(ArrayDicom <= nmax) & (ArrayDicom > nmin)] = temp[(ArrayDicom <= nmax) & (ArrayDicom > nmin)]
vmin = cp.min()
vmax = cp.max()
print ("ProcessedDicom Mean: " + str(cp.mean()))
print ("ProcessedDicom Min: " + str(vmin))
print ("ProcessedDicom Max: " + str(vmax))
img = cp * 1. / 4096
#img = exposure.equalize_hist(img)
io.imsave(path + '/image_dir_test/' + study_id + '_' + filename[-10:-4] + '.png', img)
print ('Lung', i, filename)
except:
print ("Unexpected error:", sys.exc_info())
#Temporary method: generate a new file ("all_info_studies_st_file") where each row of the all_info_studies file is split into several rows (one per sentence),
# adding the unsupervised topic label generated with para2vec ("source_topic_file")
def saveAllStudyTopicsFullDataset(save_file = all_info_studies_st_file, source_topic_file = None, study_file = all_info_studies_file):
topics = '/sentence_clusters_100.csv' if not source_topic_file else source_topic_file
path = root + '/Rx-thorax-automatic-captioning' + topics
sent_topics = pd.read_csv(path, sep = ',', header = 0, names =['key','class', 'ReportID','text'] )
path = root + '/Rx-thorax-automatic-captioning' + study_file
all_studies_DF = pd.read_csv(path, sep = ';' , header = 0)
all_studies_DF.ReportID = all_studies_DF.ReportID.astype(str)
sent_topics.ReportID = sent_topics.ReportID.astype(str)
sent_topics['class'] = sent_topics['class'].astype(str)
merge = pd.merge( all_studies_DF, sent_topics, how='left', on= 'ReportID')
merge.to_csv(save_file)
#Temporary method: generate a file for manual review of topics (review_sent_topic_#) where each row is a sentence with its unsupervised topic and its occurrence count
#The total number of rows is limited to the 1000 most frequent sentences
#The generated file is ready for manual review of topics, to help add manually reviewed labels
def extract_sent_topics(save_file = extract_topics_file , source_topic_file = None ):
topics = '/sentence_clusters.csv' if not source_topic_file else source_topic_file
path = root + '/Rx-thorax-automatic-captioning' + topics
sent_topics = pd.read_csv(path, sep = ',', header = 0, names =['key','topic', 'ReportID','text'] )
#unique sentences
sent_topics.text = sent_topics.text.astype(str)
unique_sentences = sent_topics.text.value_counts()
print(unique_sentences.head())
table = sent_topics.groupby(['text','topic'], as_index=False)
df = pd.DataFrame(table.size(), columns = ['counts'])
manual_review_sent_topic_1000 = df.sort_values(by = ['counts'], ascending = False)[:1000]
manual_review_sent_topic_1000.to_csv('manual_review/review_sent_topic_1000.csv')
#Temporary method: generate a file for manual review of topics (review_sent_topic_pending)
# where each row is a sentence with its unsupervised topic and its occurrence count
#Only sentences still not labeled are included
# (i.e. those sentences in all_info_studies_st_file without labels)
#The generated file is ready for manual review of topics
def pending_sent_labels(source_label_file = None):
topics = '/manual_review/reviewed_sent_topic.csv' if not source_label_file else source_label_file
path = root + '/Rx-thorax-automatic-captioning' + topics
sent_labels = pd.read_csv(path, sep = ';', header = 0 )
sent_labels['labels'] = np.where(sent_labels['Unnamed: 4'].isnull(),sent_labels['Unnamed: 3'] ,sent_labels['Unnamed: 3'] + ',' + sent_labels['Unnamed: 4'])
all_labels = pd.Series([label for sent in sent_labels['labels'] for label in str(sent).split(',') ])
all_labels.value_counts().sort_index().to_csv('manual_review/unique_labels.csv')
path = root + '/Rx-thorax-automatic-captioning/' + all_info_studies_st_file
all_studies_DF = pd.read_csv(path, sep = ',' , header = 0)
merge = pd.merge( all_studies_DF, sent_labels, how='left', on= 'text')
table = merge[merge['Unnamed: 3'].isnull()]
table.text = table.text.astype(str)
table = table.groupby(['text','class'], as_index=False)
df = pd.DataFrame(table.size(), columns = ['counts'])
df.to_csv('manual_review/review_sent_topic_100_pending.csv')
print(merge.describe())
def merge_labeled_files():
most_frequent_labeled_sentences = '/manual_review/reviewed_sent_topic_1000.csv'
remaining_labeled_sentences = '/manual_review/reviewed_sent_topic_imgdataset.csv'
path = root + '/Rx-thorax-automatic-captioning'
sent_labels_mf = pd.read_csv(path + most_frequent_labeled_sentences, sep = ',', header = 0 )
sent_labels_rem = pd.read_csv(path + remaining_labeled_sentences, sep = ',', header = 0 )
sent_labels_rem.rename(columns={'class': 'topic'}, inplace=True)
sent_labels = pd.concat([sent_labels_mf,sent_labels_rem], axis=0, ignore_index=True)
sent_labels.rename(columns={'Unnamed: 3': '1','Unnamed: 4': '2','Unnamed: 5': '3','Unnamed: 6': '4','Unnamed: 7': '5','Unnamed: 8': '6', 'Unnamed: 9': '7','Unnamed: 10': '8','Unnamed: 11': '9'}, inplace=True)
print(sent_labels.sample(5))
column_names = ['text','topic', 'counts']
column_names.extend(list('123456789'))
sent_labels = sent_labels[column_names]
sent_labels.to_csv('manual_review/labeled_sent_28K.csv')
def extract_loc_matches(x, local_regex):
local_regex['matches'] = local_regex['regex'].apply(lambda pattern: ', '.join(re.findall(pattern, str(x))))
m = ['loc ' + i.strip() for i in local_regex[local_regex['matches'] != '']['lab'].values]
return m
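# Hedged example with a hypothetical two-row regex table ('lab'/'regex' columns, as extract_loc_matches expects):
#local_regex = pd.DataFrame({'lab': ['basal', 'apical'], 'regex': [r'basal(es)?', r'apical(es)?']})
#extract_loc_matches('infiltrados basales', local_regex) # -> ['loc basal']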
def remove_redundant(x):
redundants = []
for i in x:
for j in x:
if j != i and j in i:
redundants.append(j)
result = list(set(x) - set(redundants))
return result
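# Hedged example: labels that are substrings of another label are dropped.
#remove_redundant(['loc basal', 'loc basal izquierdo', 'loc costal']) # -> ['loc basal izquierdo', 'loc costal'] (set order not guaranteed)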