From a72bc50d24bea4fa4fa44eea3c340daf76adab9b Mon Sep 17 00:00:00 2001 From: Katy Scott Date: Fri, 16 Jun 2023 13:15:31 -0400 Subject: [PATCH 1/9] Add .Dcm handling to crawl_one --- imgtools/utils/crawl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py index 666fb4d..d0dcdc8 100644 --- a/imgtools/utils/crawl.py +++ b/imgtools/utils/crawl.py @@ -14,7 +14,7 @@ def crawl_one(folder): database = {} for path, _, _ in os.walk(folder): # find dicoms - dicoms = glob.glob(pathlib.Path(path, "**", "*.dcm").as_posix(), recursive=True) + dicoms = glob.glob(pathlib.Path(path, "**", "*.[Dd]cm").as_posix(), recursive=True) # print('\n', folder, dicoms) # instance (slice) information for dcm in dicoms: From a0fe3a388e3591fff7dfd5e69c4ba367b7c685a9 Mon Sep 17 00:00:00 2001 From: Katy Scott Date: Fri, 16 Jun 2023 14:53:34 -0400 Subject: [PATCH 2/9] Fix crawl overwriting patient key in database_dict --- imgtools/utils/crawl.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py index d0dcdc8..8be77d7 100644 --- a/imgtools/utils/crawl.py +++ b/imgtools/utils/crawl.py @@ -182,13 +182,18 @@ def crawl(top, database_list = [] folders = glob.glob(pathlib.Path(top, "*").as_posix()) + # This is a list of dictionaries, each dictionary is a directory containing image dirs database_list = Parallel(n_jobs=n_jobs)(delayed(crawl_one)(pathlib.Path(top, folder).as_posix()) for folder in tqdm(folders)) - # convert list to dictionary + # convert list of dictionaries to single dictionary with each key being a patient ID database_dict = {} for db in database_list: for key in db: - database_dict[key] = db[key] + # If multiple directories have same patient ID, merge their information together + if key in database_dict: + database_dict[key] = database_dict[key] | db[key] + else: + database_dict[key] = db[key] # save one level above imaging folders parent, dataset = 
os.path.split(top) From 94e521c45c1ebfb5a947a9229da6e47e081e88db Mon Sep 17 00:00:00 2001 From: Katy Scott Date: Fri, 16 Jun 2023 15:13:28 -0400 Subject: [PATCH 3/9] Updated dictionary merge to work for Python 3.5 or greater --- imgtools/utils/crawl.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py index 8be77d7..dcf4385 100644 --- a/imgtools/utils/crawl.py +++ b/imgtools/utils/crawl.py @@ -191,7 +191,7 @@ def crawl(top, for key in db: # If multiple directories have same patient ID, merge their information together if key in database_dict: - database_dict[key] = database_dict[key] | db[key] + database_dict[key] = {**database_dict[key], **db[key]} else: database_dict[key] = db[key] @@ -218,15 +218,16 @@ def crawl(top, return database_dict if __name__ == "__main__": - parser = ArgumentParser("Dataset DICOM Crawler") - parser.add_argument("directory", - type=str, - help="Top-level directory of the dataset.") - parser.add_argument("--n_jobs", - type=int, - default=16, - help="Number of parallel processes for multiprocessing.") - - args = parser.parse_args() - db = crawl(args.directory, n_jobs=args.n_jobs) + # parser = ArgumentParser("Dataset DICOM Crawler") + # parser.add_argument("directory", + # type=str, + # help="Top-level directory of the dataset.") + # parser.add_argument("--n_jobs", + # type=int, + # default=16, + # help="Number of parallel processes for multiprocessing.") + + # args = parser.parse_args() + # db = crawl(args.directory, n_jobs=args.n_jobs) + db = crawl('/Users/katyscott/Documents/SARC021/images', -1) print("# patients:", len(db)) From d4437918a8d99d53c3666f59ade70d7bb01b7cd6 Mon Sep 17 00:00:00 2001 From: Katy Scott Date: Fri, 16 Jun 2023 15:14:18 -0400 Subject: [PATCH 4/9] Fixed main call --- imgtools/utils/crawl.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/imgtools/utils/crawl.py 
b/imgtools/utils/crawl.py index dcf4385..918ca54 100644 --- a/imgtools/utils/crawl.py +++ b/imgtools/utils/crawl.py @@ -218,16 +218,15 @@ def crawl(top, return database_dict if __name__ == "__main__": - # parser = ArgumentParser("Dataset DICOM Crawler") - # parser.add_argument("directory", - # type=str, - # help="Top-level directory of the dataset.") - # parser.add_argument("--n_jobs", - # type=int, - # default=16, - # help="Number of parallel processes for multiprocessing.") - - # args = parser.parse_args() - # db = crawl(args.directory, n_jobs=args.n_jobs) - db = crawl('/Users/katyscott/Documents/SARC021/images', -1) + parser = ArgumentParser("Dataset DICOM Crawler") + parser.add_argument("directory", + type=str, + help="Top-level directory of the dataset.") + parser.add_argument("--n_jobs", + type=int, + default=16, + help="Number of parallel processes for multiprocessing.") + + args = parser.parse_args() + db = crawl(args.directory, n_jobs=args.n_jobs) print("# patients:", len(db)) From 4721be33f9e6e31102c9b4356f2387c6c1aeb113 Mon Sep 17 00:00:00 2001 From: Katy Scott Date: Wed, 26 Jul 2023 15:24:32 -0400 Subject: [PATCH 5/9] Handling RTSTRUCT containing ROIs for multiple CTs --- imgtools/utils/crawl.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py index 918ca54..c8b7130 100644 --- a/imgtools/utils/crawl.py +++ b/imgtools/utils/crawl.py @@ -2,6 +2,7 @@ import os, pathlib import glob import json +from copy import deepcopy import pandas as pd from pydicom import dcmread @@ -57,6 +58,15 @@ def crawl_one(folder): reference_ct = str(meta.ReferencedImageSequence[0].ReferencedSOPInstanceUID) except: pass + try: + # If the RTSTRUCT has contours for multiple CT images, get a list all of the reference IDs for the CTs + refSOPInstances = meta.ReferencedFrameOfReferenceSequence[0].RTReferencedStudySequence[0].RTReferencedSeriesSequence[0].ContourImageSequence + reference_ct = [] + if 
len(refSOPInstances) > 1: + for idx in range(len(refSOPInstances)): + reference_ct.append(refSOPInstances[idx].ReferencedSOPInstanceUID) + except: + pass try: reference_pl = str(meta.ReferencedRTPlanSequence[0].ReferencedSOPInstanceUID) except: @@ -139,6 +149,13 @@ def crawl_one(folder): 'imaged_nucleus': elem, 'fname': rel_path.as_posix() #temporary until we switch to json-based loading } + # If there are multiple CTs referenced for this segmentation, make an RTSTRUCT instance/row for each CT ID as different acquisition/subseries (name pending) + if isinstance(reference_ct, list): + database[patient][study][series]["default"]["reference_ct"] = reference_ct[0] + for n, ct_id in enumerate(reference_ct[1:]): + database[patient][study][series][f"{subseries}_{n+1}"] = deepcopy(database[patient][study][series]["default"]) + database[patient][study][series][f"{subseries}_{n+1}"]["reference_ct"] = ct_id + database[patient][study][series][subseries]['instances'][instance] = rel_path.as_posix() except Exception as e: print(folder, e) From 7d2b919b2059eed97581d012508114302cec64b7 Mon Sep 17 00:00:00 2001 From: Katy Scott Date: Wed, 2 Aug 2023 13:35:45 -0400 Subject: [PATCH 6/9] Dropped the file name from folder for RTSTRUCT output --- imgtools/utils/crawl.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py index c8b7130..53edf00 100644 --- a/imgtools/utils/crawl.py +++ b/imgtools/utils/crawl.py @@ -127,8 +127,8 @@ def crawl_one(folder): database[patient][study] = {'description': study_description} if series not in database[patient][study]: rel_crawl_path = rel_posix - if meta.Modality == 'RTSTRUCT': - rel_crawl_path = os.path.join(rel_crawl_path, fname) + # if meta.Modality == 'RTSTRUCT': + # rel_crawl_path = os.path.join(rel_crawl_path, fname) database[patient][study][series] = {'description': series_description} if subseries not in database[patient][study][series]: @@ -212,6 +212,21 
@@ def crawl(top, else: database_dict[key] = db[key] + # INSERT CHECK FOR EMPTY REFERENCE CTs HERE + # Find entries with empty reference_ct (dictionary or dataframe?) + # Should be an RTSTRUCT + # Find the CTs that are in the same patient directory + # Look through these for the SOPInstanceUID from the RTSTRUCT + # Record the real reference_ct ID + # OR + # Look for entries where the reference_ct does not match any series + # Then, find the CTs that are in the same patient directory + # Look through these for the SOPInstanceUID from the RTSTRUCT + # Record the real reference_ct ID + # database_df = to_df(database_dict) + + + # save one level above imaging folders parent, dataset = os.path.split(top) @@ -225,6 +240,7 @@ def crawl(top, # save as json with open(pathlib.Path(parent_imgtools, f'imgtools_{dataset}.json').as_posix(), 'w') as f: + # Can I change this to saving a dataframe instead json.dump(database_dict, f, indent=4) # save as dataframe @@ -232,7 +248,8 @@ def crawl(top, df_path = pathlib.Path(parent_imgtools, f'imgtools_{dataset}.csv').as_posix() df.to_csv(df_path) - return database_dict + # KATY YOU HAVE TO REMOVE THE DF HERE + return database_dict, df if __name__ == "__main__": parser = ArgumentParser("Dataset DICOM Crawler") From 2976574007e4536589ee77a503ba24ea7ee95df4 Mon Sep 17 00:00:00 2001 From: Katy Scott Date: Wed, 2 Aug 2023 16:54:49 -0400 Subject: [PATCH 7/9] Added working function to get missing reference ct values --- imgtools/utils/crawl.py | 81 ++++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 25 deletions(-) diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py index 53edf00..4578885 100644 --- a/imgtools/utils/crawl.py +++ b/imgtools/utils/crawl.py @@ -10,6 +10,54 @@ from joblib import Parallel, delayed +def findMissingCTReference(database_df, folder): + # Find rows in the dataset dataframe that are for RTSTRUCTS with missing reference CT values + missingRefCTs = database_df[(database_df['reference_ct'] 
== '') & (database_df['modality'] == 'RTSTRUCT')] + database_df = database_df.drop(missingRefCTs.index) + + for idx, rt in missingRefCTs.iterrows(): + rt_path = os.path.join(os.path.dirname(folder), rt['file_path']) + # Load the RTSTRUCT again + rt_meta = dcmread(rt_path, force=True, stop_before_pixels=True) + # Get any reference SOP Instances from the RTSTRUCT - these will be individual slices in the CT they correspond to + refSOPInstances = rt_meta.ReferencedFrameOfReferenceSequence[0].RTReferencedStudySequence[0].RTReferencedSeriesSequence[0].ContourImageSequence + reference_ct_list = [] + if len(refSOPInstances) > 0: + for idx in range(len(refSOPInstances)): + reference_ct_list.append(refSOPInstances[idx].ReferencedSOPInstanceUID) + + # Get a new dataframe with rows for each CT reference + updatedRTRows = pd.concat([missingRefCTs.iloc[[0]]]*len(refSOPInstances)) + updatedRTRows.reset_index(drop=True, inplace=True) + + # Get any CTs for the patient the RTSTRUCT is from + cts = database_df[(database_df['patient_ID'] == rt['patient_ID']) & (database_df['modality'] == 'CT')] + + # Check each CT to see if it has the slice with the SOP in it + for ct in cts.itertuples(): + if reference_ct_list == []: + print("All associations found. 
Exiting search.") + break + ct_path = os.path.join(os.path.dirname(folder), ct.folder) + dicoms = glob.glob(pathlib.Path(ct_path, "**", "*.[Dd]cm").as_posix(), recursive=True) + for dcm in dicoms: + ct_meta = dcmread(dcm, specific_tags=['SOPInstanceUID', 'SeriesInstanceUID']) + if ct_meta.SOPInstanceUID in reference_ct_list: + print(ct_meta.SOPInstanceUID, "is in", ct_meta.SeriesInstanceUID) + updatedRTRows.at[len(reference_ct_list)-1, 'reference_ct'] = ct_meta.SeriesInstanceUID + reference_ct_list.remove(ct_meta.SOPInstanceUID) + break + + if reference_ct_list != []: + print("Some associations not found.") + + database_df = pd.concat([database_df, updatedRTRows], ignore_index=True) + database_df.sort_values(by=['patient_ID', 'folder'], inplace=True) + database_df.reset_index(drop=True, inplace=True) + + return database_df + + def crawl_one(folder): folder_path = pathlib.Path(folder) database = {} @@ -58,15 +106,6 @@ def crawl_one(folder): reference_ct = str(meta.ReferencedImageSequence[0].ReferencedSOPInstanceUID) except: pass - try: - # If the RTSTRUCT has contours for multiple CT images, get a list all of the reference IDs for the CTs - refSOPInstances = meta.ReferencedFrameOfReferenceSequence[0].RTReferencedStudySequence[0].RTReferencedSeriesSequence[0].ContourImageSequence - reference_ct = [] - if len(refSOPInstances) > 1: - for idx in range(len(refSOPInstances)): - reference_ct.append(refSOPInstances[idx].ReferencedSOPInstanceUID) - except: - pass try: reference_pl = str(meta.ReferencedRTPlanSequence[0].ReferencedSOPInstanceUID) except: @@ -212,20 +251,12 @@ def crawl(top, else: database_dict[key] = db[key] - # INSERT CHECK FOR EMPTY REFERENCE CTs HERE - # Find entries with empty reference_ct (dictionary or dataframe?) 
- # Should be an RTSTRUCT - # Find the CTs that are in the same patient directory - # Look through these for the SOPInstanceUID from the RTSTRUCT - # Record the real reference_ct ID - # OR - # Look for entries where the reference_ct does not match any series - # Then, find the CTs that are in the same patient directory - # Look through these for the SOPInstanceUID from the RTSTRUCT - # Record the real reference_ct ID - # database_df = to_df(database_dict) - - + # Checking for empty reference CT values - this works! + database_df = to_df(database_dict) + missingRefCTs = database_df[(database_df['reference_ct'] == '') & (database_df['modality'] == 'RTSTRUCT')] + if not missingRefCTs.empty: + df = findMissingCTReference(database_df, top) + # save one level above imaging folders parent, dataset = os.path.split(top) @@ -238,6 +269,7 @@ def crawl(top, except: pass + # TODO: update this to save out the database_df instead of the dict # save as json with open(pathlib.Path(parent_imgtools, f'imgtools_{dataset}.json').as_posix(), 'w') as f: # Can I change this to saving a dataframe instead @@ -248,8 +280,7 @@ def crawl(top, df_path = pathlib.Path(parent_imgtools, f'imgtools_{dataset}.csv').as_posix() df.to_csv(df_path) - # KATY YOU HAVE TO REMOVE THE DF HERE - return database_dict, df + return database_dict if __name__ == "__main__": parser = ArgumentParser("Dataset DICOM Crawler") From d9c562152ce8a667b5179767c55b039844098363 Mon Sep 17 00:00:00 2001 From: Katy Scott Date: Thu, 3 Aug 2023 11:09:59 -0400 Subject: [PATCH 8/9] Remove second df conversion --- imgtools/utils/crawl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py index 4578885..5ad168c 100644 --- a/imgtools/utils/crawl.py +++ b/imgtools/utils/crawl.py @@ -276,7 +276,7 @@ def crawl(top, json.dump(database_dict, f, indent=4) # save as dataframe - df = to_df(database_dict) + # df = to_df(database_dict) df_path = pathlib.Path(parent_imgtools, 
f'imgtools_{dataset}.csv').as_posix() df.to_csv(df_path) From 56b09398fb8b0eee71de7fbbfd31a20306f12864 Mon Sep 17 00:00:00 2001 From: Katy Scott Date: Thu, 3 Aug 2023 16:15:22 -0400 Subject: [PATCH 9/9] added file name back to RTSTRUCT folder listing --- imgtools/utils/crawl.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py index 5ad168c..6513395 100644 --- a/imgtools/utils/crawl.py +++ b/imgtools/utils/crawl.py @@ -16,7 +16,7 @@ def findMissingCTReference(database_df, folder): database_df = database_df.drop(missingRefCTs.index) for idx, rt in missingRefCTs.iterrows(): - rt_path = os.path.join(os.path.dirname(folder), rt['file_path']) + rt_path = os.path.join(os.path.dirname(folder), rt['folder']) # Load the RTSTRUCT again rt_meta = dcmread(rt_path, force=True, stop_before_pixels=True) # Get any reference SOP Instances from the RTSTRUCT - these will be individual slices in the CT they correspond to @@ -26,6 +26,8 @@ def findMissingCTReference(database_df, folder): for idx in range(len(refSOPInstances)): reference_ct_list.append(refSOPInstances[idx].ReferencedSOPInstanceUID) + # reference_ct_list_sample = [i.ReferencedSOPInstanceUID for i in rt_meta.ReferencedFrameOfReferenceSequence[0].RTReferencedStudySequence[0].RTReferencedSeriesSequence[0].ContourImageSequence] + # Get a new dataframe with rows for each CT reference updatedRTRows = pd.concat([missingRefCTs.iloc[[0]]]*len(refSOPInstances)) updatedRTRows.reset_index(drop=True, inplace=True) @@ -111,6 +113,12 @@ def crawl_one(folder): except: pass + # Special metadata + try: + reference_ct_special = [i.ReferencedSOPInstanceUID for i in rt_meta.ReferencedFrameOfReferenceSequence[0].RTReferencedStudySequence[0].RTReferencedSeriesSequence[0].ContourImageSequence] + except: + pass + #MRI Tags try: tr = float(meta.RepetitionTime) @@ -166,8 +174,8 @@ def crawl_one(folder): database[patient][study] = {'description': study_description} 
if series not in database[patient][study]: rel_crawl_path = rel_posix - # if meta.Modality == 'RTSTRUCT': - # rel_crawl_path = os.path.join(rel_crawl_path, fname) + if meta.Modality == 'RTSTRUCT': + rel_crawl_path = os.path.join(rel_crawl_path, fname) database[patient][study][series] = {'description': series_description} if subseries not in database[patient][study][series]: @@ -188,6 +196,7 @@ def crawl_one(folder): 'imaged_nucleus': elem, 'fname': rel_path.as_posix() #temporary until we switch to json-based loading } + # If there are multiple CTs referenced for this segmentation, make an RTSTRUCT instance/row for each CT ID as different acquisition/subseries (name pending) if isinstance(reference_ct, list): database[patient][study][series]["default"]["reference_ct"] = reference_ct[0]