# Add generating 3D volumes and masks from segmentations #3

Merged 1 commit on Dec 14, 2023.
README.md (54 changes: 47 additions & 7 deletions)
@@ -4,7 +4,7 @@ Utility functions for MD.ai. Download and upload 2D and 3D segmentation images.

## Download data

- Download all data: dicoms and annotations.

```bash
python -m mdai_utils.download_annotations \
--download_dicoms
```

Dicoms will be downloaded with the default md.ai structure:

```md
- data/mdai_uab_project_L1NprBvP_images_dataset_D_odXMLm_2023-12-13-152228/
  - 1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286421/
    - 1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286453/
      1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414630768.dcm
      1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414645741.dcm
      1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414662833.dcm
      1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414677861.dcm
      1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414694890.dcm
```
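
To inspect one of these downloaded series manually, a minimal ITK sketch (the directory path is a placeholder), mirroring what the script does internally, might look like:

```python
# Minimal sketch (illustrative path): read one downloaded dicom series as a 3D volume.
from pathlib import Path

import itk

series_dir = Path("data") / "<images_folder>" / "<study_uid>" / "<series_uid>"  # placeholders

names_generator = itk.GDCMSeriesFileNames.New()
names_generator.SetUseSeriesDetails(True)
names_generator.SetDirectory(str(series_dir))
names_generator.Update()

series_uids = names_generator.GetSeriesUIDs()
# Filenames are returned ordered by z position.
filenames = names_generator.GetFileNames(series_uids[0])

reader = itk.ImageSeriesReader.New(FileNames=filenames, ForceOrthogonalDirection=False)
reader.Update()
volume = reader.GetOutput()
print(volume.GetLargestPossibleRegion().GetSize())
```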

If the option `--create_volumes` is added (to the CLI or the parameters file), a 3D
image will be generated alongside the `.dcm` files:

```md
image.nii.gz
volume_metadata.json
```
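
A minimal sketch (paths illustrative) of loading the generated volume and its metadata; the `slices` entry of `volume_metadata.json` maps each SOPInstanceUID to its slice index and source `.dcm` file:

```python
# Minimal sketch (illustrative paths): read the generated 3D image and its metadata.
import json

import itk

volume = itk.imread("image.nii.gz")
with open("volume_metadata.json") as f:
    metadata = json.load(f)

# "slices" maps SOPInstanceUID -> {"index": slice_index, "filename": source_dcm}.
print(volume.GetLargestPossibleRegion().GetSize(), len(metadata["slices"]))
```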

The annotations/segmentations from md.ai are stored in a json file:

```md
./data/mdai_uab_project_L1NprBvP_annotations_dataset_D_odXMLm_labelgroup_G_2Jy2yZ_2023-12-13-152213.json
```
and the masks/images are stored in a folder:

```md
./data/mdai_uab_project_L1NprBvP_annotations_dataset_D_odXMLm_labelgroup_G_2Jy2yZ_2023-12-13-152213_segmentations_2023-12-14-114011/
```

with the following structure:

```md
mylabel__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286421__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286453__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414630768.nii.gz
mylabel__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286421__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286453__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414645741.nii.gz
mylabel__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286421__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286453__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414662833.nii.gz
mylabel__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286421__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286453__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414677861.nii.gz
mylabel__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286421__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286453__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414694890.nii.gz
pair_data.json
volumes/ # Only generated with --create_volumes option
```
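
`pair_data.json` pairs each dicom slice with its per-label mask. A minimal sketch (folder name and label are illustrative) of inspecting it:

```python
# Minimal sketch (illustrative folder and label): inspect pair_data.json.
import json
from pathlib import Path

segmentations_folder = Path("./data/<segmentations_folder>")  # placeholder

with open(segmentations_folder / "pair_data.json") as f:
    pair_data = json.load(f)

for entry in pair_data:
    study_uid, series_uid, sop_uid = entry["study__series__sop_ids"].split("__")
    # "image" is the source dicom slice; "mylabel" is the path to that slice's mask.
    print(sop_uid, entry["image"], entry.get("mylabel"))
```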

---

Once dicoms are downloaded locally and have not changed in md.ai, do not pass
the `--download_dicoms` option, to avoid re-downloading them.

## Upload 2D segmentations

mdai_utils/download_annotations.py (206 changes: 206 additions & 0 deletions)
@@ -143,6 +143,190 @@ def match_folder_to_json_file(
return Path(input_path) / folder_match[0] if folder_match else None


def get_dicom_names_ordered_and_metadata(dicom_dir):
"""
Read the dicom files in a folder.
Return a list of tuples (filename, sop_instance_uid) ordered by z position.
"""
io = itk.GDCMSeriesFileNames.New() # pyright: ignore[reportGeneralTypeIssues]
io.AddSeriesRestriction("0008|0021")
io.SetUseSeriesDetails(True)
io.SetDirectory(str(dicom_dir))
io.Update()

series_uids = io.GetSeriesUIDs()

if len(series_uids) == 0:
raise ValueError(f"Found no series in folder: {dicom_dir}")
elif len(series_uids) > 1:
raise ValueError(f"Found more than one series in the same folder: {dicom_dir}")
series_id = series_uids[0]
# Ordered by z position
dicom_names_ordered = io.GetFileNames(series_id)
# Get also SOPInstanceUID for each file

# Create an instance of GDCMImageIO
gdcm_io = itk.GDCMImageIO.New() # pyright: ignore[reportGeneralTypeIssues]
gdcm_io.LoadPrivateTagsOn()

# Get SOPInstanceUID for each file
dicom_names_with_uid_ordered = []
metadict_volume = {}
for i, dicom_name in enumerate(dicom_names_ordered):
gdcm_io.SetFileName(dicom_name)
gdcm_io.ReadImageInformation()
if i == 0:
reader = (
itk.ImageFileReader.New( # pyright: ignore[reportGeneralTypeIssues]
FileName=dicom_names_ordered[0], ImageIO=gdcm_io
)
)
reader.Update()
metadict = reader.GetMetaDataDictionary()
tagkeys = metadict.GetKeys()
for tagkey in tagkeys:
if (
tagkey == "ITK_original_direction"
or tagkey == "ITK_original_spacing"
):
continue
tagvalue = metadict[tagkey]
metadict_volume[tagkey] = tagvalue

_, sop_instance_uid = gdcm_io.GetValueFromTag(
"0008|0018", ""
) # Tag for SOPInstanceUID
dicom_names_with_uid_ordered.append((dicom_name, sop_instance_uid))

# Add to the metadict_volume a new key: "slices", a dict with key SOPInstanceUID
# And values: a new dict with index, and filename
slices_dict = {}
for i, (dicom_name, sop_instance_uid) in enumerate(dicom_names_with_uid_ordered):
slices_dict[sop_instance_uid] = {"index": i, "filename": str(dicom_name)}
metadict_volume["slices"] = slices_dict

return dicom_names_with_uid_ordered, metadict_volume


def merge_slices_into3D(
pair_data_json_file, labels, output_path_mask_3d, output_path_grayscale_3d=None
):
"""
PRECONDITION: pair_data_json_file contains a list of slices of paired data:
image (dicom) and label (mask in nifti format)
We want to read all dicoms as a 3D volume, and merge all nifti masks into a single 3D mask,
with the same shape as the dicom volume.
"""

with open(pair_data_json_file) as f:
pair_data = json.load(f)

if len(pair_data) == 0:
raise ValueError(f"pair_data_json_file is empty: {pair_data_json_file}")

# We want to group all the slices sharing same StudyInstanceUID and SeriesInstanceUID
pair_data_grouped = {}
for slice in pair_data:
hash_id = slice["study__series__sop_ids"]
study_id, series_id, sop_id = hash_id.split("__")
if study_id not in pair_data_grouped:
pair_data_grouped[study_id] = {}
if series_id not in pair_data_grouped[study_id]:
pair_data_grouped[study_id][series_id] = []
slice_with_sop_id = slice.copy()
slice_with_sop_id["sop_id"] = sop_id
pair_data_grouped[study_id][series_id].append(slice_with_sop_id)

# About the ordering...
# We are going to read the original dicom files with GDCM via itk
for study_id, series_dict in pair_data_grouped.items():
for series_id, slices in series_dict.items():
# Read the dicom files
dicom_dir = Path(slices[0]["image"]).parent
(
dicom_names_with_uid_ordered,
metadict_volume,
) = get_dicom_names_ordered_and_metadata(dicom_dir)

# Read the dicom files with itk SeriesReader
filenames_ordered = [name for name, _ in dicom_names_with_uid_ordered]
reader = (
itk.ImageSeriesReader.New( # pyright: ignore[reportGeneralTypeIssues]
FileNames=filenames_ordered, ForceOrthogonalDirection=False
)
)
reader.Update()
dicom_volume = reader.GetOutput()
if output_path_grayscale_3d is not None:
output_grayscale_parent_folder = (
Path(output_path_grayscale_3d) / study_id / series_id
)
output_grayscale_parent_folder.mkdir(parents=True, exist_ok=True)
output_dicom_volume_path = (
output_grayscale_parent_folder / "image.nii.gz"
)
itk.imwrite(dicom_volume, output_dicom_volume_path)
# Save the metadata
metadict_volume_output_path = (
output_grayscale_parent_folder / "volume_metadata.json"
)
with open(metadict_volume_output_path, "w") as f:
json.dump(metadict_volume, f, indent=2)

# Ok, now we have the dicom volume and the mapping for ordered SOPInstanceUID
# We now need to create a 3D mask with the same shape as the dicom volume
# And populate it with the masks from the slices, after ordering them using the SOPInstanceUID order from dicom_names_with_uid_ordered

ordered_sop_ids = [sop_id for _, sop_id in dicom_names_with_uid_ordered]

# Create the 3D mask
output_mask_parent_folder = Path(output_path_mask_3d) / study_id / series_id
output_mask_parent_folder.mkdir(parents=True, exist_ok=True)
for label in labels:
label_volume_np = np.zeros(dicom_volume.shape, dtype=np.uint8)
for slice in slices:
# Read the mask
mask_path = slice.get(label)
if mask_path is None:
raise ValueError(
f"Could not find label {label} in slice {slice}"
)
label_image = itk.imread(mask_path)
# Assert image is 3D.
if label_image.GetImageDimension() != 3:
raise ValueError(
f"Expected that the mask is a 3D image, with Z=1 slice, found {label_image.GetImageDimension()}"
)
# Get the SOPInstanceUID
hash_id = slice["study__series__sop_ids"]
_, _, sop_id = hash_id.split("__")
# Get the index of the SOPInstanceUID in the ordered list
sop_index = ordered_sop_ids.index(sop_id)
label_image_np = itk.array_from_image(
label_image
) # The label is Z=1, Y, X
# Populate the 3D mask slice
label_volume_np[sop_index] = label_image_np.squeeze()

# Convert to itk image
label_volume = itk.image_from_array(label_volume_np)
label_volume.SetOrigin(dicom_volume.GetOrigin())
label_volume.SetSpacing(dicom_volume.GetSpacing())
label_volume.SetDirection(dicom_volume.GetDirection())

# Check label and image have the same shape
label_volume_size = np.array(label_volume.shape)
image_volume_size = np.array(dicom_volume.shape)
if not np.all(label_volume_size == image_volume_size):
raise ValueError(
f"Label and image have different shapes: {label_volume_size} != {image_volume_size}"
)

# Write the label volume
output_mask_path = output_mask_parent_folder / f"{label}.nii.gz"
itk.imwrite(label_volume, output_mask_path)


def main(args):
print(args)

@@ -172,6 +356,7 @@ def main(args):
"mdai_no_fixing_metadata", False
)
labels = args.labels or parameters.get("labels", [])
create_volumes = args.create_volumes or parameters.get("create_volumes", False)

mdai_label_ids = bidict(parameters.get("mdai_label_ids", {}))
if labels:
@@ -246,6 +431,7 @@ def main(args):
sep = "__"
hash_id = sep.join(hash_)
pair_data_entry = {}
pair_data_entry["study__series__sop_ids"] = hash_id

raw_slice_image = None
if not no_fixing_metadata:
@@ -307,6 +493,20 @@ def main(args):

print(f"pair_data_folder: {labels_parent_folder}")

if create_volumes:
pair_data_json_file = Path(labels_parent_folder) / "pair_data.json"
out_path_masks_3d = Path(labels_parent_folder) / "volumes"
out_path_masks_3d.mkdir(parents=True, exist_ok=True)
# Create the volumes next to the original data downloaded from md.ai.
# The structure is {study_id}/{series_id}/*.dcm.
# We add the 3D nifti volume and the metadata in that folder.
out_path_grayscale_3d = None
if not no_fixing_metadata and download_dicoms:
out_path_grayscale_3d = Path(match_folder) if match_folder else None
merge_slices_into3D(
pair_data_json_file, labels, out_path_masks_3d, out_path_grayscale_3d
)


def get_download_parser():
import argparse
@@ -371,6 +571,12 @@ def get_download_parser():
help="Labels to process. If none is set, all labels found will be processed.",
)

parser.add_argument(
"--create_volumes",
action="store_true",
help="The dataset is 3D, and we want to merge all slices and dicoms into 3D volumes.",
)

parser.add_argument(
"--parameters",
type=str,
tests/test_parameters.json (3 changes: 2 additions & 1 deletion)
@@ -11,5 +11,6 @@
"mdai_no_fixing_metadata": false,
"labels": [
"mylabel"
],
"create_volumes": true
}
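
For reference, the same flow can be driven programmatically with the parser and entry point added in this PR; a minimal sketch (argument values illustrative, and valid md.ai credentials are assumed to be configured in the parameters file):

```python
# Minimal sketch (illustrative arguments): run the download and volume generation
# through the CLI entry point added in this PR.
from mdai_utils.download_annotations import get_download_parser, main

parser = get_download_parser()
args = parser.parse_args(
    [
        "--parameters", "tests/test_parameters.json",
        "-o", "./data",
        "--create_volumes",
    ]
)
main(args)
```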