From a72bc50d24bea4fa4fa44eea3c340daf76adab9b Mon Sep 17 00:00:00 2001
From: Katy Scott <k.l.scott16@gmail.com>
Date: Fri, 16 Jun 2023 13:15:31 -0400
Subject: [PATCH 1/9] Add .Dcm handling to crawl_one

---
 imgtools/utils/crawl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py
index 666fb4d..d0dcdc8 100644
--- a/imgtools/utils/crawl.py
+++ b/imgtools/utils/crawl.py
@@ -14,7 +14,7 @@ def crawl_one(folder):
     database = {}
     for path, _, _ in os.walk(folder):
         # find dicoms
-        dicoms = glob.glob(pathlib.Path(path, "**", "*.dcm").as_posix(), recursive=True)
+        dicoms = glob.glob(pathlib.Path(path, "**", "*.[Dd]cm").as_posix(), recursive=True)
         # print('\n', folder, dicoms)
         # instance (slice) information
         for dcm in dicoms:

From a0fe3a388e3591fff7dfd5e69c4ba367b7c685a9 Mon Sep 17 00:00:00 2001
From: Katy Scott <k.l.scott16@gmail.com>
Date: Fri, 16 Jun 2023 14:53:34 -0400
Subject: [PATCH 2/9] Fix crawl overwriting patient key in database_dict

---
 imgtools/utils/crawl.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py
index d0dcdc8..8be77d7 100644
--- a/imgtools/utils/crawl.py
+++ b/imgtools/utils/crawl.py
@@ -182,13 +182,18 @@ def crawl(top,
     database_list = []
     folders = glob.glob(pathlib.Path(top, "*").as_posix())
     
+    # This is a list of dictionaries, each dictionary is a directory containing image dirs 
     database_list = Parallel(n_jobs=n_jobs)(delayed(crawl_one)(pathlib.Path(top, folder).as_posix()) for folder in tqdm(folders))
 
-    # convert list to dictionary
+    # convert list of dictionaries to single dictionary with each key being a patient ID
     database_dict = {}
     for db in database_list:
         for key in db:
-            database_dict[key] = db[key]
+            # If multiple directories have same patient ID, merge their information together
+            if key in database_dict:
+                database_dict[key] = database_dict[key] | db[key]
+            else:
+                database_dict[key] = db[key]
     
     # save one level above imaging folders
     parent, dataset  = os.path.split(top)

From 94e521c45c1ebfb5a947a9229da6e47e081e88db Mon Sep 17 00:00:00 2001
From: Katy Scott <k.l.scott16@gmail.com>
Date: Fri, 16 Jun 2023 15:13:28 -0400
Subject: [PATCH 3/9] Updated dictionary merge to work for Python 3.5 or
 greater

---
 imgtools/utils/crawl.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py
index 8be77d7..dcf4385 100644
--- a/imgtools/utils/crawl.py
+++ b/imgtools/utils/crawl.py
@@ -191,7 +191,7 @@ def crawl(top,
         for key in db:
             # If multiple directories have same patient ID, merge their information together
             if key in database_dict:
-                database_dict[key] = database_dict[key] | db[key]
+                database_dict[key] = {**database_dict[key], **db[key]}
             else:
                 database_dict[key] = db[key]
     
@@ -218,15 +218,16 @@ def crawl(top,
     return database_dict
 
 if __name__ == "__main__":
-    parser = ArgumentParser("Dataset DICOM Crawler")
-    parser.add_argument("directory",
-                         type=str,
-                         help="Top-level directory of the dataset.")
-    parser.add_argument("--n_jobs",
-                         type=int,
-                         default=16,
-                         help="Number of parallel processes for multiprocessing.")
-
-    args = parser.parse_args()
-    db = crawl(args.directory, n_jobs=args.n_jobs)
+    # parser = ArgumentParser("Dataset DICOM Crawler")
+    # parser.add_argument("directory",
+    #                      type=str,
+    #                      help="Top-level directory of the dataset.")
+    # parser.add_argument("--n_jobs",
+    #                      type=int,
+    #                      default=16,
+    #                      help="Number of parallel processes for multiprocessing.")
+
+    # args = parser.parse_args()
+    # db = crawl(args.directory, n_jobs=args.n_jobs)
+    db = crawl('/Users/katyscott/Documents/SARC021/images', -1)
     print("# patients:", len(db))

From d4437918a8d99d53c3666f59ade70d7bb01b7cd6 Mon Sep 17 00:00:00 2001
From: Katy Scott <k.l.scott16@gmail.com>
Date: Fri, 16 Jun 2023 15:14:18 -0400
Subject: [PATCH 4/9] Fixed main call

---
 imgtools/utils/crawl.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py
index dcf4385..918ca54 100644
--- a/imgtools/utils/crawl.py
+++ b/imgtools/utils/crawl.py
@@ -218,16 +218,15 @@ def crawl(top,
     return database_dict
 
 if __name__ == "__main__":
-    # parser = ArgumentParser("Dataset DICOM Crawler")
-    # parser.add_argument("directory",
-    #                      type=str,
-    #                      help="Top-level directory of the dataset.")
-    # parser.add_argument("--n_jobs",
-    #                      type=int,
-    #                      default=16,
-    #                      help="Number of parallel processes for multiprocessing.")
-
-    # args = parser.parse_args()
-    # db = crawl(args.directory, n_jobs=args.n_jobs)
-    db = crawl('/Users/katyscott/Documents/SARC021/images', -1)
+    parser = ArgumentParser("Dataset DICOM Crawler")
+    parser.add_argument("directory",
+                         type=str,
+                         help="Top-level directory of the dataset.")
+    parser.add_argument("--n_jobs",
+                         type=int,
+                         default=16,
+                         help="Number of parallel processes for multiprocessing.")
+
+    args = parser.parse_args()
+    db = crawl(args.directory, n_jobs=args.n_jobs)
     print("# patients:", len(db))

From 4721be33f9e6e31102c9b4356f2387c6c1aeb113 Mon Sep 17 00:00:00 2001
From: Katy Scott <k.l.scott16@gmail.com>
Date: Wed, 26 Jul 2023 15:24:32 -0400
Subject: [PATCH 5/9] Handling RTSTRUCT containing ROIs for multiple CTs

---
 imgtools/utils/crawl.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py
index 918ca54..c8b7130 100644
--- a/imgtools/utils/crawl.py
+++ b/imgtools/utils/crawl.py
@@ -2,6 +2,7 @@
 import os, pathlib
 import glob
 import json
+from copy import deepcopy
 
 import pandas as pd
 from pydicom import dcmread
@@ -57,6 +58,15 @@ def crawl_one(folder):
                             reference_ct = str(meta.ReferencedImageSequence[0].ReferencedSOPInstanceUID)
                         except:
                             pass
+                        try:
+                            # If the RTSTRUCT has contours for multiple CT images, get a list all of the reference IDs for the CTs
+                            refSOPInstances = meta.ReferencedFrameOfReferenceSequence[0].RTReferencedStudySequence[0].RTReferencedSeriesSequence[0].ContourImageSequence
+                            reference_ct = []
+                            if len(refSOPInstances) > 1:
+                                for idx in range(len(refSOPInstances)):
+                                    reference_ct.append(refSOPInstances[idx].ReferencedSOPInstanceUID)
+                        except:
+                            pass
                         try:
                             reference_pl = str(meta.ReferencedRTPlanSequence[0].ReferencedSOPInstanceUID)
                         except:
@@ -139,6 +149,13 @@ def crawl_one(folder):
                                                                    'imaged_nucleus': elem,
                                                                    'fname': rel_path.as_posix() #temporary until we switch to json-based loading
                                                                    }
+                    # If there are multiple CTs referenced for this segmentation, make an RTSTRUCT instance/row for each CT ID as different acquisition/subseries (name pending)
+                    if isinstance(reference_ct, list):
+                        database[patient][study][series]["default"]["reference_ct"] = reference_ct[0]
+                        for n, ct_id in enumerate(reference_ct[1:]):
+                            database[patient][study][series][f"{subseries}_{n+1}"] = deepcopy(database[patient][study][series]["default"])
+                            database[patient][study][series][f"{subseries}_{n+1}"]["reference_ct"] = ct_id
+                        
                 database[patient][study][series][subseries]['instances'][instance] = rel_path.as_posix()
             except Exception as e:
                 print(folder, e)

From 7d2b919b2059eed97581d012508114302cec64b7 Mon Sep 17 00:00:00 2001
From: Katy Scott <k.l.scott16@gmail.com>
Date: Wed, 2 Aug 2023 13:35:45 -0400
Subject: [PATCH 6/9] Dropped the file name from folder for RTSTRUCT output

---
 imgtools/utils/crawl.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py
index c8b7130..53edf00 100644
--- a/imgtools/utils/crawl.py
+++ b/imgtools/utils/crawl.py
@@ -127,8 +127,8 @@ def crawl_one(folder):
                     database[patient][study] = {'description': study_description}
                 if series not in database[patient][study]:
                     rel_crawl_path  = rel_posix
-                    if meta.Modality == 'RTSTRUCT':
-                        rel_crawl_path = os.path.join(rel_crawl_path, fname)
+                    # if meta.Modality == 'RTSTRUCT':
+                    #     rel_crawl_path = os.path.join(rel_crawl_path, fname)
                     
                     database[patient][study][series] = {'description': series_description}
                 if subseries not in database[patient][study][series]:
@@ -212,6 +212,21 @@ def crawl(top,
             else:
                 database_dict[key] = db[key]
     
+    # INSERT CHECK FOR EMPTY REFERENCE CTs HERE
+    # Find entries with empty reference_ct (dictionary or dataframe?)
+    # Should be an RTSTRUCT
+    # Find the CTs that are in the same patient directory
+    # Look through these for the SOPInstanceUID from the RTSTRUCT
+    # Record the real reference_ct ID
+    # OR 
+    # Look for entries where the reference_ct does not match any series
+    # Then, find the CTs that are in the same patient directory
+    # Look through these for the SOPInstanceUID from the RTSTRUCT
+    # Record the real reference_ct ID
+    # database_df = to_df(database_dict)
+
+
+
     # save one level above imaging folders
     parent, dataset  = os.path.split(top)
 
@@ -225,6 +240,7 @@ def crawl(top,
     
     # save as json
     with open(pathlib.Path(parent_imgtools, f'imgtools_{dataset}.json').as_posix(), 'w') as f:
+        # Can I change this to saving a dataframe instead
         json.dump(database_dict, f, indent=4)
     
     # save as dataframe
@@ -232,7 +248,8 @@ def crawl(top,
     df_path = pathlib.Path(parent_imgtools, f'imgtools_{dataset}.csv').as_posix()
     df.to_csv(df_path)
     
-    return database_dict
+    # KATY YOU HAVE TO REMOVE THE DF HERE
+    return database_dict, df
 
 if __name__ == "__main__":
     parser = ArgumentParser("Dataset DICOM Crawler")

From 2976574007e4536589ee77a503ba24ea7ee95df4 Mon Sep 17 00:00:00 2001
From: Katy Scott <k.l.scott16@gmail.com>
Date: Wed, 2 Aug 2023 16:54:49 -0400
Subject: [PATCH 7/9] Added working function to get missing reference ct values

---
 imgtools/utils/crawl.py | 81 ++++++++++++++++++++++++++++-------------
 1 file changed, 56 insertions(+), 25 deletions(-)

diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py
index 53edf00..4578885 100644
--- a/imgtools/utils/crawl.py
+++ b/imgtools/utils/crawl.py
@@ -10,6 +10,54 @@
 
 from joblib import Parallel, delayed
 
+def findMissingCTReference(database_df, folder):
+    # Find rows in the dataset dataframe that are for RTSTRUCTS with missing reference CT values
+    missingRefCTs = database_df[(database_df['reference_ct'] == '') & (database_df['modality'] == 'RTSTRUCT')]
+    database_df = database_df.drop(missingRefCTs.index)
+
+    for idx, rt in missingRefCTs.iterrows():
+        rt_path = os.path.join(os.path.dirname(folder), rt['file_path'])
+        # Load the RTSTRUCT again
+        rt_meta = dcmread(rt_path, force=True, stop_before_pixels=True)
+        # Get any reference SOP Instances from the RTSTRUCT - these will be individual slices in the CT they correspond to
+        refSOPInstances = rt_meta.ReferencedFrameOfReferenceSequence[0].RTReferencedStudySequence[0].RTReferencedSeriesSequence[0].ContourImageSequence
+        reference_ct_list = []
+        if len(refSOPInstances) > 0:
+            for idx in range(len(refSOPInstances)):
+                reference_ct_list.append(refSOPInstances[idx].ReferencedSOPInstanceUID)
+
+        # Get a new dataframe with rows for each CT reference
+        updatedRTRows = pd.concat([missingRefCTs.iloc[[0]]]*len(refSOPInstances))
+        updatedRTRows.reset_index(drop=True, inplace=True)  
+
+        # Get any CTs for the patient the RTSTRUCT is from
+        cts = database_df[(database_df['patient_ID'] == rt['patient_ID']) & (database_df['modality'] == 'CT')]
+        
+        # Check each CT to see if it has the slice with the SOP in it
+        for ct in cts.itertuples():
+            if reference_ct_list == []:
+                print("All associations found. Exiting search.")
+                break
+            ct_path = os.path.join(os.path.dirname(folder), ct.folder)
+            dicoms = glob.glob(pathlib.Path(ct_path, "**", "*.[Dd]cm").as_posix(), recursive=True)
+            for dcm in dicoms:
+                ct_meta = dcmread(dcm, specific_tags=['SOPInstanceUID', 'SeriesInstanceUID'])
+                if ct_meta.SOPInstanceUID in reference_ct_list:
+                    print(ct_meta.SOPInstanceUID, "is in", ct_meta.SeriesInstanceUID)
+                    updatedRTRows.at[len(reference_ct_list)-1, 'reference_ct'] = ct_meta.SeriesInstanceUID
+                    reference_ct_list.remove(ct_meta.SOPInstanceUID)
+                    break
+
+        if reference_ct_list != []:
+            print("Some associations not found.")
+        
+    database_df = pd.concat([database_df, updatedRTRows], ignore_index=True)
+    database_df.sort_values(by=['patient_ID', 'folder'], inplace=True)
+    database_df.reset_index(drop=True, inplace=True)  
+
+    return database_df
+
+
 def crawl_one(folder):
     folder_path = pathlib.Path(folder)
     database = {}
@@ -58,15 +106,6 @@ def crawl_one(folder):
                             reference_ct = str(meta.ReferencedImageSequence[0].ReferencedSOPInstanceUID)
                         except:
                             pass
-                        try:
-                            # If the RTSTRUCT has contours for multiple CT images, get a list all of the reference IDs for the CTs
-                            refSOPInstances = meta.ReferencedFrameOfReferenceSequence[0].RTReferencedStudySequence[0].RTReferencedSeriesSequence[0].ContourImageSequence
-                            reference_ct = []
-                            if len(refSOPInstances) > 1:
-                                for idx in range(len(refSOPInstances)):
-                                    reference_ct.append(refSOPInstances[idx].ReferencedSOPInstanceUID)
-                        except:
-                            pass
                         try:
                             reference_pl = str(meta.ReferencedRTPlanSequence[0].ReferencedSOPInstanceUID)
                         except:
@@ -212,20 +251,12 @@ def crawl(top,
             else:
                 database_dict[key] = db[key]
     
-    # INSERT CHECK FOR EMPTY REFERENCE CTs HERE
-    # Find entries with empty reference_ct (dictionary or dataframe?)
-    # Should be an RTSTRUCT
-    # Find the CTs that are in the same patient directory
-    # Look through these for the SOPInstanceUID from the RTSTRUCT
-    # Record the real reference_ct ID
-    # OR 
-    # Look for entries where the reference_ct does not match any series
-    # Then, find the CTs that are in the same patient directory
-    # Look through these for the SOPInstanceUID from the RTSTRUCT
-    # Record the real reference_ct ID
-    # database_df = to_df(database_dict)
-
-
+    # Checking for empty reference CT values - this works!
+    database_df = to_df(database_dict)
+    missingRefCTs = database_df[(database_df['reference_ct'] == '') & (database_df['modality'] == 'RTSTRUCT')]
+    if not missingRefCTs.empty:
+        df = findMissingCTReference(database_df, top)
+    
 
     # save one level above imaging folders
     parent, dataset  = os.path.split(top)
@@ -238,6 +269,7 @@ def crawl(top,
         except:
             pass
     
+    # TODO: update this to save out the database_df instead of the dict
     # save as json
     with open(pathlib.Path(parent_imgtools, f'imgtools_{dataset}.json').as_posix(), 'w') as f:
         # Can I change this to saving a dataframe instead
@@ -248,8 +280,7 @@ def crawl(top,
     df_path = pathlib.Path(parent_imgtools, f'imgtools_{dataset}.csv').as_posix()
     df.to_csv(df_path)
     
-    # KATY YOU HAVE TO REMOVE THE DF HERE
-    return database_dict, df
+    return database_dict
 
 if __name__ == "__main__":
     parser = ArgumentParser("Dataset DICOM Crawler")

From d9c562152ce8a667b5179767c55b039844098363 Mon Sep 17 00:00:00 2001
From: Katy Scott <k.l.scott16@gmail.com>
Date: Thu, 3 Aug 2023 11:09:59 -0400
Subject: [PATCH 8/9] Remove second df conversion

---
 imgtools/utils/crawl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py
index 4578885..5ad168c 100644
--- a/imgtools/utils/crawl.py
+++ b/imgtools/utils/crawl.py
@@ -276,7 +276,7 @@ def crawl(top,
         json.dump(database_dict, f, indent=4)
     
     # save as dataframe
-    df = to_df(database_dict)
+    # df = to_df(database_dict)
     df_path = pathlib.Path(parent_imgtools, f'imgtools_{dataset}.csv').as_posix()
     df.to_csv(df_path)
     

From 56b09398fb8b0eee71de7fbbfd31a20306f12864 Mon Sep 17 00:00:00 2001
From: Katy Scott <k.l.scott16@gmail.com>
Date: Thu, 3 Aug 2023 16:15:22 -0400
Subject: [PATCH 9/9] added file name back to RTSTRUCT folder listing

---
 imgtools/utils/crawl.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/imgtools/utils/crawl.py b/imgtools/utils/crawl.py
index 5ad168c..6513395 100644
--- a/imgtools/utils/crawl.py
+++ b/imgtools/utils/crawl.py
@@ -16,7 +16,7 @@ def findMissingCTReference(database_df, folder):
     database_df = database_df.drop(missingRefCTs.index)
 
     for idx, rt in missingRefCTs.iterrows():
-        rt_path = os.path.join(os.path.dirname(folder), rt['file_path'])
+        rt_path = os.path.join(os.path.dirname(folder), rt['folder'])
         # Load the RTSTRUCT again
         rt_meta = dcmread(rt_path, force=True, stop_before_pixels=True)
         # Get any reference SOP Instances from the RTSTRUCT - these will be individual slices in the CT they correspond to
@@ -26,6 +26,8 @@ def findMissingCTReference(database_df, folder):
             for idx in range(len(refSOPInstances)):
                 reference_ct_list.append(refSOPInstances[idx].ReferencedSOPInstanceUID)
 
+        # reference_ct_list_sample = [i.ReferencedSOPInstanceUID for i in rt_meta.ReferencedFrameOfReferenceSequence[0].RTReferencedStudySequence[0].RTReferencedSeriesSequence[0].ContourImageSequence]
+
         # Get a new dataframe with rows for each CT reference
         updatedRTRows = pd.concat([missingRefCTs.iloc[[0]]]*len(refSOPInstances))
         updatedRTRows.reset_index(drop=True, inplace=True)  
@@ -111,6 +113,12 @@ def crawl_one(folder):
                         except:
                             pass
                 
+                # Special metadata 
+                try:
+                    reference_ct_special = [i.ReferencedSOPInstanceUID for i in rt_meta.ReferencedFrameOfReferenceSequence[0].RTReferencedStudySequence[0].RTReferencedSeriesSequence[0].ContourImageSequence]
+                except:
+                    pass
+
                 #MRI Tags
                 try:
                     tr = float(meta.RepetitionTime)
@@ -166,8 +174,8 @@ def crawl_one(folder):
                     database[patient][study] = {'description': study_description}
                 if series not in database[patient][study]:
                     rel_crawl_path  = rel_posix
-                    # if meta.Modality == 'RTSTRUCT':
-                    #     rel_crawl_path = os.path.join(rel_crawl_path, fname)
+                    if meta.Modality == 'RTSTRUCT':
+                        rel_crawl_path = os.path.join(rel_crawl_path, fname)
                     
                     database[patient][study][series] = {'description': series_description}
                 if subseries not in database[patient][study][series]:
@@ -188,6 +196,7 @@ def crawl_one(folder):
                                                                    'imaged_nucleus': elem,
                                                                    'fname': rel_path.as_posix() #temporary until we switch to json-based loading
                                                                    }
+
                     # If there are multiple CTs referenced for this segmentation, make an RTSTRUCT instance/row for each CT ID as different acquisition/subseries (name pending)
                     if isinstance(reference_ct, list):
                         database[patient][study][series]["default"]["reference_ct"] = reference_ct[0]