From 6b7b41cb191fdf4b4222284156f06ff80a2a5622 Mon Sep 17 00:00:00 2001
From: Andrey Fedorov
Date: Fri, 2 Aug 2024 17:50:18 -0400
Subject: [PATCH 01/13] STYLE: remove DF conversion function for the index
 overview

In the future, we can add a convenience conversion utility function that
could serve the same purpose without being attached to a specific class
variable.

BUG: fixed codespell complaint
BUG: fixed pylint errors
BUG: fixed pylint errors
BUG: fixed test
ENH: added printing configurations for codespell

added download size calculation for single instance download. Changed tqdm
bar data to be displayed in GB/MB instead of bytes

ENH: enable downloading data in manifests from previous IDC versions
ENH: add description for the previous versions index
ENH: fix error messages for items not identified in the current version
ENH: add clinical_index

also added checks for existence of the URLs containing remote indices

BUG: use trim to remove any extraneous spaces while parsing s3 url in manifest
ENH: simplify s3_url extraction
update to simplify to use clinical_index from idc-index
clarify wording of a section
BUG: remove notebook committed by accident from Colab
BUG: fix viewer series selection parameter

SeriesInstanceUID changed to SeriesInstanceUIDs, see
https://docs.ohif.org/migration-guide/from-3p7-to-3p8#studyinstanceuid-in-the-url-param

ENH: simplify download and create destination directory if needed

Automatic creation of the destination directory mimics the behavior of
s5cmd and simplifies usage

DOC: fix explanation of the dirTemplate parameter
fixed bug
fixed bug second try
wip
ENH: update to IDC v19
ENH: upgrade idc-index-data and use query-based clinical_index
removed hardcoded idc-index-data version
---
 .pre-commit-config.yaml |   1 +
 idc_index/cli.py        |   4 +-
 idc_index/index.py      | 409 +++++++++++++++++++++++++++++++---------
 pyproject.toml          |   2 +-
 tests/idcindex.py       |  35 +++-
 5 files changed, 352 insertions(+), 99 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a8ce7587..b1b5f058 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -60,6 +60,7 @@ repos:
     rev: "v2.2.6"
     hooks:
       - id: codespell
+        args: ["--quiet-level 3"]

   - repo: https://github.com/shellcheck-py/shellcheck-py
     rev: "v0.9.0.6"

diff --git a/idc_index/cli.py b/idc_index/cli.py
index 09fed46c..3171276b 100644
--- a/idc_index/cli.py
+++ b/idc_index/cli.py
@@ -113,7 +113,7 @@ def set_log_level(log_level):
     "--dir-template",
     type=str,
     default=IDCClient.DOWNLOAD_HIERARCHY_DEFAULT,
-    help="Download directory hierarchy template. This variable defines the folder hierarchy for the organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None all files will be downloaded to the download directory with no subdirectories.",
+    help="Download directory hierarchy template. 
This variable defines the folder hierarchy for organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to empty string (\"\") all files will be downloaded to the download directory with no subdirectories.",
 )
 def download_from_selection(
     download_dir,
@@ -233,7 +233,7 @@ def download_from_selection(
     "--dir-template",
     type=str,
     default=IDCClient.DOWNLOAD_HIERARCHY_DEFAULT,
-    help="Download directory hierarchy template. This variable defines the folder hierarchy for the organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None all files will be downloaded to the download directory with no subdirectories.",
+    help="Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to empty string (\"\") all files will be downloaded to the download directory with no subdirectories.",
 )
 def download_from_manifest(
     manifest_file,
diff --git a/idc_index/index.py b/idc_index/index.py
index 5cc76be8..fbbe7694 100644
--- a/idc_index/index.py
+++ b/idc_index/index.py
@@ -41,7 +41,7 @@ class IDCClient:
     CITATION_FORMAT_BIBTEX = "application/x-bibtex"

     # Singleton pattern
-    # NOTE: In the future, one may want to use multiple clients e.g. for sub-datasets so a attribute-singleton as shown bewlo seems a better option.
+    # NOTE: In the future, one may want to use multiple clients, e.g. for sub-datasets, so an attribute-singleton as shown below seems a better option. 
# _instance: IDCClient # def __new__(cls): # if not hasattr(cls, "_instance") or getattr(cls, "_instance") is None: @@ -63,18 +63,30 @@ def __init__(self): file_path = idc_index_data.IDC_INDEX_PARQUET_FILEPATH logger.debug(f"Reading index file v{idc_index_data.__version__}") self.index = pd.read_parquet(file_path) + + self.previous_versions_index_path = ( + idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH + ) + # self.index = self.index.astype(str).replace("nan", "") self.index["series_size_MB"] = self.index["series_size_MB"].astype(float) self.collection_summary = self.index.groupby("collection_id").agg( {"Modality": pd.Series.unique, "series_size_MB": "sum"} ) + idc_version = f"v{Version(idc_index_data.__version__).major}" + self.indices_overview = { "index": { "description": "Main index containing one row per DICOM series.", "installed": True, "url": None, }, + "previous_versions_index": { + "description": "index containing one row per DICOM series from all previous IDC versions that are not in current version.", + "installed": True, + "url": None, + }, "sm_index": { "description": "DICOM Slide Microscopy series-level index.", "installed": False, @@ -85,6 +97,11 @@ def __init__(self): "installed": False, "url": f"{asset_endpoint_url}/sm_instance_index.parquet", }, + "clinical_index": { + "description": "Index of clinical data accompanying the available images.", + "installed": False, + "url": f"{asset_endpoint_url}/clinical_index.parquet", + }, } # Lookup s5cmd @@ -119,7 +136,12 @@ def _filter_dataframe_by_id(key, dataframe, _id): @staticmethod def _safe_filter_by_selection( - df_index, collection_id, patientId, studyInstanceUID, seriesInstanceUID + df_index, + collection_id, + patientId, + studyInstanceUID, + seriesInstanceUID, + sopInstanceUID=None, ): if collection_id is not None: if not isinstance(collection_id, str) and not isinstance( @@ -139,6 +161,11 @@ def _safe_filter_by_selection( seriesInstanceUID, list ): raise TypeError("seriesInstanceUID must be a string or list of strings") + if sopInstanceUID is not None: + if not isinstance(sopInstanceUID, str) and not isinstance( + sopInstanceUID, list + ): + raise TypeError("sopInstanceUID must be a string or list of strings") if collection_id is not None: result_df = IDCClient._filter_by_collection_id(df_index, collection_id) @@ -158,6 +185,11 @@ def _safe_filter_by_selection( result_df, seriesInstanceUID ) + if sopInstanceUID is not None: + result_df = IDCClient._filter_by_dicom_instance_uid( + result_df, sopInstanceUID + ) + return result_df @staticmethod @@ -182,6 +214,12 @@ def _filter_by_dicom_series_uid(df_index, dicom_series_uid): "SeriesInstanceUID", df_index, dicom_series_uid ) + @staticmethod + def _filter_by_dicom_instance_uid(df_index, dicom_instance_uid): + return IDCClient._filter_dataframe_by_id( + "SOPInstanceUID", df_index, dicom_instance_uid + ) + @staticmethod def get_idc_version(): """ @@ -190,19 +228,19 @@ def get_idc_version(): idc_version = Version(idc_index_data.__version__).major return f"v{idc_version}" - def list_indices(self): + @staticmethod + def _check_create_directory(download_dir): """ - Lists all available indices including their installation status. - - Returns: - indices_overview (pd.DataFrame): DataFrame containing information per index. 
+        Mimic the behavior of s5cmd and create the download directory if it does not exist.
         """
+        download_dir = Path(download_dir)
+        download_dir.mkdir(parents=True, exist_ok=True)

-        return pd.DataFrame.from_dict(self.indices_overview, orient="index")
+        return str(download_dir.resolve())

     def fetch_index(self, index) -> None:
         """
-        Downloads requested index.
+        Downloads the requested index and adds it, joined with the main index, as a class attribute.

         Args:
             index (str): Name of the index to be downloaded.
@@ -221,10 +259,19 @@ def fetch_index(self, index) -> None:
                 idc_index_data.IDC_INDEX_PARQUET_FILEPATH.parents[0],
                 f"{index}.parquet",
             )
+
             with open(filepath, mode="wb") as file:
                 file.write(response.content)
-            setattr(self.__class__, index, pd.read_parquet(filepath))
+
+            # Join new index with main index
+            sm_instance_index = pd.read_parquet(filepath)
+            sm_instance_index = sm_instance_index.merge(
+                self.index, on="SeriesInstanceUID", how="left"
+            )
+
+            setattr(self.__class__, index, sm_instance_index)
             self.indices_overview[index]["installed"] = True
+
         else:
             logger.error(
                 f"Failed to fetch index from URL {self.indices_overview[index]['url']}: {response.status_code}"
@@ -501,7 +548,7 @@ def get_viewer_URL(
             ohif_v2, ohif_v3, or slim. If not provided, default viewers will be used.

         Returns:
-            string containing the IDC viewer URL for the given SeriesInstanceUID
+            string containing the IDC viewer URL for the requested selection
         """

         if seriesInstanceUID is None and studyInstanceUID is None:
@@ -574,7 +621,7 @@ def get_viewer_URL(
         if seriesInstanceUID is None:
             viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/v3/viewer/?StudyInstanceUIDs={studyInstanceUID}"
         else:
-            viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/v3/viewer/?StudyInstanceUIDs={studyInstanceUID}&SeriesInstanceUID={seriesInstanceUID}"
+            viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/v3/viewer/?StudyInstanceUIDs={studyInstanceUID}&SeriesInstanceUIDs={seriesInstanceUID}"
     elif viewer_selector == "volview":
         # TODO! 
Not implemented yet viewer_url = None @@ -627,6 +674,14 @@ def _validate_update_manifest_and_get_download_size( # create a copy of the index index_df_copy = self.index + # use default hierarchy + if dirTemplate is not None: + hierarchy = self._generate_sql_concat_for_building_directory( + dirTemplate=dirTemplate, downloadDir=downloadDir + ) + else: + hierarchy = "NULL" + # Extract s3 url and crdc_series_uuid from the manifest copy commands # Next, extract crdc_series_uuid from aws_series_url in the index and # try to verify if every series in the manifest is present in the index @@ -634,7 +689,7 @@ def _validate_update_manifest_and_get_download_size( # TODO: need to remove the assumption that manifest commands will have 'cp' # and need to parse S3 URL directly # ruff: noqa - sql = """ + sql = f""" PRAGMA disable_progress_bar; WITH index_temp AS ( @@ -642,19 +697,21 @@ def _validate_update_manifest_and_get_download_size( seriesInstanceUID, series_aws_url, series_size_MB, - REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){3}([^\\/?#]+)', 1) index_crdc_series_uuid + {hierarchy} AS path, + REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) index_crdc_series_uuid FROM index_df_copy), manifest_temp AS ( SELECT manifest_cp_cmd, - REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){3}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid, - REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') AS s3_url, + REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid, + REGEXP_EXTRACT(manifest_cp_cmd, 's3://\\S+') AS s3_url, FROM manifest_df ) SELECT seriesInstanceuid, s3_url, + path, series_size_MB, index_crdc_series_uuid is not NULL as crdc_series_uuid_match, s3_url==series_aws_url AS s3_url_match, @@ -682,12 +739,83 @@ def _validate_update_manifest_and_get_download_size( missing_manifest_cp_cmds = merged_df.loc[ ~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd" ] - logger.error( - "The following manifest copy commands are not recognized as referencing any associated series in the index.\n" + missing_in_main_cnt = len(missing_manifest_cp_cmds.tolist()) + logger.warning( + f"The total of {missing_in_main_cnt} copy commands are not recognized as referencing any associated series in the main index.\n" "This means either these commands are invalid, or they may correspond to files available in a release of IDC\n" - f"different from {self.get_idc_version()} used in this version of idc-index. The corresponding files will not be downloaded.\n" + f"different from {self.get_idc_version()} used in this version of idc-index. Prior data releases will be checked next." 
) - logger.error("\n" + "\n".join(missing_manifest_cp_cmds.tolist())) + + logger.debug( + "Checking if the requested data is available in other idc versions " + ) + missing_series_sql = f""" + PRAGMA disable_progress_bar; + WITH + combined_index AS + (SELECT + *, + {hierarchy} AS path, + FROM + index_df_copy + union by name + SELECT + *, + {hierarchy} AS path, + FROM + '{self.previous_versions_index_path}' pvip + + ), + index_temp AS ( + SELECT + seriesInstanceUID, + series_aws_url, + series_size_MB, + path, + REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) index_crdc_series_uuid + FROM + combined_index), + manifest_temp AS ( + SELECT + manifest_cp_cmd, + REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid, + REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') AS s3_url, + FROM + manifest_df ) + SELECT + seriesInstanceuid, + s3_url, + path, + series_size_MB, + index_crdc_series_uuid is not NULL as crdc_series_uuid_match, + TRIM(s3_url) = TRIM(series_aws_url) AS s3_url_match, + manifest_temp.manifest_cp_cmd, + CASE + WHEN TRIM(s3_url) = TRIM(series_aws_url) THEN 'aws' + ELSE + 'unknown' + END + AS endpoint + FROM + manifest_temp + LEFT JOIN + index_temp + ON + index_temp.index_crdc_series_uuid = manifest_temp.manifest_crdc_series_uuid + """ + merged_df = duckdb.query(missing_series_sql).df() + if not all(merged_df["crdc_series_uuid_match"]): + missing_manifest_cp_cmds = merged_df.loc[ + ~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd" + ] + logger.error( + "The following manifest copy commands are not recognized as referencing any associated series in any release of IDC.\n" + "This means either these commands are invalid. Please submit an issue on https://github.com/ImagingDataCommons/idc-index/issues \n" + "The corresponding files could not be downloaded.\n" + ) + logger.error("\n" + "\n".join(missing_manifest_cp_cmds.tolist())) + else: + logger.info("All of the identifiers from manifest have been resolved!") if validate_manifest: # Check if there is more than one endpoint @@ -738,29 +866,29 @@ def _validate_update_manifest_and_get_download_size( total_size = merged_df["series_size_MB"].sum() total_size = round(total_size, 2) - if dirTemplate is not None: - hierarchy = self._generate_sql_concat_for_building_directory( - dirTemplate=dirTemplate, downloadDir=downloadDir - ) - sql = f""" - WITH temp as - ( - SELECT - seriesInstanceUID, - s3_url - FROM - merged_df - ) - SELECT - s3_url, - {hierarchy} as path - FROM - temp - JOIN - index using (seriesInstanceUID) - """ - logger.debug(f"About to run this query:\n{sql}") - merged_df = self.sql_query(sql) + # if dirTemplate is not None: + # hierarchy = self._generate_sql_concat_for_building_directory( + # dirTemplate=dirTemplate, downloadDir=downloadDir + # ) + # sql = f""" + # WITH temp as + # ( + # SELECT + # seriesInstanceUID, + # s3_url + # FROM + # merged_df + # ) + # SELECT + # s3_url, + # {hierarchy} as path + # FROM + # temp + # JOIN + # index using (seriesInstanceUID) + # """ + # logger.debug(f"About to run this query:\n{sql}") + # merged_df = self.sql_query(sql) # Write a temporary manifest file with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_manifest_file: if use_s5cmd_sync and len(os.listdir(downloadDir)) != 0: @@ -865,41 +993,40 @@ def _track_download_progress( runtime_errors = [] if show_progress_bar: - total_size_bytes = size_MB * 10**6 # Convert MB to bytes - # temporary place holder. 
Accurate size is calculated in the next step + total_size_to_be_downloaded_bytes = size_MB * (10**6) initial_size_bytes = 0 # Calculate the initial size of the directory for directory in list_of_directories: initial_size_bytes = IDCClient._get_dir_sum_file_size(directory) - logger.info("Initial size of the directory: %s bytes", initial_size_bytes) logger.info( - "Approximate size of the files that need to be downloaded: %s bytes", - total_size_bytes, + "Initial size of the directory: %s", + IDCClient._format_size_bytes(initial_size_bytes), + ) + logger.info( + "Approximate size of the files that need to be downloaded: %s", + IDCClient._format_size(size_MB), ) pbar = tqdm( - total=total_size_bytes, + total=total_size_to_be_downloaded_bytes, unit="B", unit_scale=True, desc="Downloading data", ) while True: + time.sleep(0.5) downloaded_bytes = 0 for directory in list_of_directories: downloaded_bytes += IDCClient._get_dir_sum_file_size(directory) downloaded_bytes -= initial_size_bytes pbar.n = min( - downloaded_bytes, total_size_bytes + downloaded_bytes, total_size_to_be_downloaded_bytes ) # Prevent the progress bar from exceeding 100% pbar.refresh() - if process.poll() is not None: break - - time.sleep(0.5) - # Wait for the process to finish _, stderr = process.communicate() pbar.close() @@ -1133,7 +1260,7 @@ def _s5cmd_run( logger.info( f"Requested total download size is {total_size} MB, \ however at least {existing_data_size} MB is already present,\ - so downloading only remaining upto {sync_size} MB\n\ + so downloading only remaining up to {sync_size} MB\n\ Please note that disk sizes are calculated at series level, \ so if individual files are missing, displayed progress bar may\ not be accurate." @@ -1216,6 +1343,17 @@ def _format_size(size_MB): return f"{round(size_GB, 2)} GB" return f"{round(size_MB, 2)} MB" + @staticmethod + def _format_size_bytes(size_bytes): + size_MB = size_bytes / (10**6) + size_GB = size_MB / 1000 + + if size_GB >= 1: + return f"{round(size_GB, 2)} GB" + if size_MB >= 1: + return f"{round(size_MB, 2)} MB" + return f"{round(size_bytes, 2)} bytes" + def download_from_manifest( self, manifestFile: str, @@ -1245,9 +1383,7 @@ def download_from_manifest( ValueError: If the download directory does not exist. """ - downloadDir = os.path.abspath(downloadDir).replace("\\", "/") - if not os.path.exists(downloadDir): - raise ValueError("Download directory does not exist.") + downloadDir = self._check_create_directory(downloadDir) # validate the manifest ( @@ -1402,6 +1538,7 @@ def download_from_selection( patientId=None, studyInstanceUID=None, seriesInstanceUID=None, + sopInstanceUID=None, quiet=True, show_progress_bar=True, use_s5cmd_sync=False, @@ -1417,6 +1554,7 @@ def download_from_selection( patientId: string or list of strings containing the values of PatientID to filter by studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by + sopInstanceUID: string or list of strings containing the values of DICOM SOPInstanceUID to filter by quiet (bool): If True, suppresses the output of the subprocess. 
Defaults to True show_progress_bar (bool): If True, tracks the progress of download use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded @@ -1424,24 +1562,43 @@ def download_from_selection( """ - downloadDir = os.path.abspath(downloadDir).replace("\\", "/") - if not os.path.exists(downloadDir): - raise ValueError("Download directory does not exist.") + downloadDir = self._check_create_directory(downloadDir) + + # If SOPInstanceUID(s) are given, we need to join the main index with the instance-level index + if sopInstanceUID: + if hasattr( + self, "sm_instance_index" + ): # check if instance-level index is installed + index_to_be_filtered = self.sm_instance_index + else: + logger.error( + "Instance-level access not possible because instance-level index not installed." + ) + raise ValueError( + "Instance-level access not possible because instance-level index not installed." + ) + else: + index_to_be_filtered = self.index result_df = self._safe_filter_by_selection( - self.index, + index_to_be_filtered, collection_id=collection_id, patientId=patientId, studyInstanceUID=studyInstanceUID, seriesInstanceUID=seriesInstanceUID, + sopInstanceUID=sopInstanceUID, ) - total_size = round(result_df["series_size_MB"].sum(), 2) + if not sopInstanceUID: + total_size = round(result_df["series_size_MB"].sum(), 2) + else: + total_size_bytes = round(result_df["instance_size"].sum(), 2) + total_size = total_size_bytes / (10**6) # in MB logger.info("Total size of files to download: " + self._format_size(total_size)) logger.info( "Total free space on disk: " + str(psutil.disk_usage(downloadDir).free / (1000 * 1000 * 1000)) - + "GB" + + " GB" ) if dry_run: @@ -1455,51 +1612,76 @@ def download_from_selection( downloadDir=downloadDir, dirTemplate=dirTemplate, ) - sql = f""" - WITH temp as - ( - SELECT - seriesInstanceUID - FROM - result_df - ) - SELECT - series_aws_url, - {hierarchy} as path - FROM - temp - JOIN - index using (seriesInstanceUID) - """ + if sopInstanceUID: + sql = f""" + WITH temp as + ( + SELECT + sopInstanceUID + FROM + result_df + ) + SELECT + CONCAT(TRIM('*' FROM series_aws_url), crdc_instance_uuid, '.dcm') as instance_url, + CONCAT({hierarchy}, '/') as path + FROM + temp + JOIN + sm_instance_index using (sopInstanceUID) + """ + else: + sql = f""" + WITH temp as + ( + SELECT + seriesInstanceUID + FROM + result_df + ) + SELECT + series_aws_url, + {hierarchy} as path + FROM + temp + JOIN + index using (seriesInstanceUID) + """ result_df = self.sql_query(sql) - # Download the files - # make temporary file to store the list of files to download + # Download the files and make temporary file to store the list of files to download + with tempfile.NamedTemporaryFile(mode="w", delete=False) as manifest_file: + # Determine column containing the URL for instance / series-level access + if sopInstanceUID: + if not "instance_url" in result_df: + result_df["instance_url"] = ( + result_df["series_aws_url"].replace("/*", "/") + + result_df["crdc_instance_uuid"] + + ".dcm" + ) + url_column = "instance_url" + else: + url_column = "series_aws_url" + if use_s5cmd_sync and len(os.listdir(downloadDir)) != 0: if dirTemplate is not None: result_df["s5cmd_cmd"] = ( - "sync " - + result_df["series_aws_url"] - + ' "' - + result_df["path"] - + '"' + "sync " + result_df[url_column] + ' "' + result_df["path"] + '"' ) else: result_df["s5cmd_cmd"] = ( - "sync " + 
result_df["series_aws_url"] + ' "' + downloadDir + '"'
+                    "sync " + result_df[url_column] + ' "' + downloadDir + '"'
                 )
             elif dirTemplate is not None:
                 result_df["s5cmd_cmd"] = (
-                    "cp " + result_df["series_aws_url"] + ' "' + result_df["path"] + '"'
+                    "cp " + result_df[url_column] + ' "' + result_df["path"] + '"'
                 )
             else:
                 result_df["s5cmd_cmd"] = (
-                    "cp " + result_df["series_aws_url"] + ' "' + downloadDir + '"'
+                    "cp " + result_df[url_column] + ' "' + downloadDir + '"'
                 )

             # Combine all commands into a single string with newline separators
             commands = "\n".join(result_df["s5cmd_cmd"])
-
             manifest_file.write(commands)

         if dirTemplate is not None:
@@ -1523,6 +1705,44 @@ def download_from_selection(
             list_of_directories=list_of_directories,
         )

+    def download_dicom_instance(
+        self,
+        sopInstanceUID,
+        downloadDir,
+        dry_run=False,
+        quiet=True,
+        show_progress_bar=True,
+        use_s5cmd_sync=False,
+        dirTemplate=DOWNLOAD_HIERARCHY_DEFAULT,
+    ) -> None:
+        """
+        Download the files corresponding to the given SOPInstanceUID(s) to the specified directory.
+
+        Args:
+            sopInstanceUID: string or list of strings containing the values of DICOM SOPInstanceUID to filter by
+            downloadDir: string containing the path to the directory to download the files to
+            dry_run: calculates the size of the cohort but does not start the download
+            quiet (bool): If True, suppresses the output of the subprocess. Defaults to True.
+            show_progress_bar (bool): If True, tracks the progress of download
+            use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded
+            dirTemplate (str): Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None all files will be downloaded to the download directory with no subdirectories. 
+ + Returns: None + + Raises: + TypeError: If sopInstanceUID(s) passed is(are) not a string or list + + """ + self.download_from_selection( + downloadDir, + sopInstanceUID=sopInstanceUID, + dry_run=dry_run, + quiet=quiet, + show_progress_bar=show_progress_bar, + use_s5cmd_sync=use_s5cmd_sync, + dirTemplate=dirTemplate, + ) + def download_dicom_series( self, seriesInstanceUID, @@ -1689,4 +1909,11 @@ def sql_query(self, sql_query): """ index = self.index + # TODO: find a more elegant way to automate the following + if hasattr(self, "sm_index"): + sm_index = self.sm_index + if hasattr(self, "sm_instance_index"): + sm_instance_index = self.sm_instance_index + if hasattr(self, "clinical_index"): + clinical_index = self.clinical_index return duckdb.query(sql_query).to_df() diff --git a/pyproject.toml b/pyproject.toml index 92304920..f48c9ad0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ dynamic = ["version"] dependencies = [ "click", 'duckdb>=0.10.0', - "idc-index-data==18.1.0", + "idc-index-data==19.0.1", "packaging", "pandas<2.2", "psutil", diff --git a/tests/idcindex.py b/tests/idcindex.py index 168ac1f0..4038afdb 100644 --- a/tests/idcindex.py +++ b/tests/idcindex.py @@ -9,6 +9,7 @@ import pandas as pd import pytest +import requests from click.testing import CliRunner from idc_index import IDCClient, cli @@ -18,6 +19,17 @@ logging.basicConfig(level=logging.DEBUG) +def remote_file_exists(url): + try: + response = requests.head(url, allow_redirects=True) + # Check if the status code indicates success + return response.status_code == 200 + except requests.RequestException as e: + # Handle any exceptions (e.g., network issues) + print(f"An error occurred: {e}") + return False + + @pytest.fixture(autouse=True) def _change_test_dir(request, monkeypatch): monkeypatch.chdir(request.fspath.dirname) @@ -174,6 +186,17 @@ def test_download_dicom_series(self): ) self.assertEqual(sum([len(files) for r, d, files in os.walk(temp_dir)]), 3) + def test_download_dicom_instance(self): + i = IDCClient() + i.fetch_index("sm_instance_index") + with tempfile.TemporaryDirectory() as temp_dir: + self.client.download_dicom_instance( + sopInstanceUID="1.3.6.1.4.1.5962.99.1.528744472.1087975700.1641206284312.14.0", + downloadDir=temp_dir, + ) + + self.assertEqual(sum([len(files) for r, d, files in os.walk(temp_dir)]), 1) + def test_download_with_template(self): dirTemplateValues = [ None, @@ -481,11 +504,7 @@ def test_prior_version_manifest(self): # is fully resolved, the manifest below should not be empty, and this test should be updated # with count equal to 5 with open(temp_manifest_file) as file: - assert len(file.readlines()) == 0 - - def test_list_indices(self): - i = IDCClient() - assert i.indices_overview # assert that dict was created + assert len(file.readlines()) == 5 def test_fetch_index(self): i = IDCClient() @@ -494,6 +513,12 @@ def test_fetch_index(self): assert i.indices_overview["sm_index"]["installed"] is True assert hasattr(i, "sm_index") + def test_indices_urls(self): + i = IDCClient() + for index in i.indices_overview: + if i.indices_overview[index]["url"] is not None: + assert remote_file_exists(i.indices_overview[index]["url"]) + if __name__ == "__main__": unittest.main() From 5a9093b18b632b3b732f48d184da4d71be6dba50 Mon Sep 17 00:00:00 2001 From: ds_93 Date: Thu, 19 Sep 2024 17:51:57 +0200 Subject: [PATCH 02/13] Revert "fixed error" This reverts commit eda4f82d25e04319bc2ae70ab58d0038941d6474. 
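
Reverting restores reporting of instance-level download sizes directly in
bytes via _format_size(total_size_bytes, size_in_bytes=True). For reference,
a minimal sketch of the dual-unit size formatter this series converges on;
the real implementation lives in idc_index/index.py, and the final fallback
here is adjusted to always report whole bytes, which is an editorial
assumption rather than the verbatim code:

    def format_size(size, size_in_bytes: bool = False) -> str:
        """Return a human-readable size given MB (default) or bytes."""
        # Normalize the input to megabytes first.
        size_MB = size / 10**6 if size_in_bytes else size
        size_GB = size_MB / 1000
        size_TB = size_GB / 1000
        if size_TB >= 1:
            return f"{round(size_TB, 2)} TB"
        if size_GB >= 1:
            return f"{round(size_GB, 2)} GB"
        if size_MB >= 1:
            return f"{round(size_MB, 2)} MB"
        # Sub-megabyte sizes are reported in whole bytes.
        return f"{round(size_MB * 10**6)} bytes"

    # format_size(1234, size_in_bytes=True) -> '1234 bytes'
    # format_size(2500)                     -> '2.5 GB'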
--- idc_index/index.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index fbbe7694..e70aac38 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -1593,8 +1593,22 @@ def download_from_selection( total_size = round(result_df["series_size_MB"].sum(), 2) else: total_size_bytes = round(result_df["instance_size"].sum(), 2) - total_size = total_size_bytes / (10**6) # in MB - logger.info("Total size of files to download: " + self._format_size(total_size)) + logger.info( + "Total size of files to download: " + + self._format_size(total_size_bytes, size_in_bytes=True) + ) + + # disk_free_space_MB = psutil.disk_usage(downloadDir).free / (1000 * 1000) + # if disk_free_space_MB < total_size: + # logger.error("Not enough free space on disk to download the files.") + # logger.error( + # "Total size of files to download: " + self._format_size(total_size) + # ) + # logger.error( + # "Total free space on disk: " + self._format_size(disk_free_space_MB) + # ) + # return + logger.info( "Total free space on disk: " + str(psutil.disk_usage(downloadDir).free / (1000 * 1000 * 1000)) From 585c8592aa5e6226d8b0fd75168b3f575e5970ef Mon Sep 17 00:00:00 2001 From: ds_93 Date: Fri, 20 Sep 2024 11:16:07 +0200 Subject: [PATCH 03/13] Revert "removed hardcoded idc-index-data version" This reverts commit 111c01c01ac37ed9943ea5ecf762ebbcffb96a74. --- idc_index/index.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/idc_index/index.py b/idc_index/index.py index e70aac38..5848486d 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -22,6 +22,11 @@ aws_endpoint_url = "https://s3.amazonaws.com" gcp_endpoint_url = "https://storage.googleapis.com" asset_endpoint_url = f"https://github.com/ImagingDataCommons/idc-index-data/releases/download/{idc_index_data.__version__}" +# TODO: remove later +asset_endpoint_url = ( + "https://github.com/ImagingDataCommons/idc-index-data/releases/download/18.2.0" +) + logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO) logger = logging.getLogger(__name__) From 8b25255e82c1ac0260f5985f994a3f56a1f21174 Mon Sep 17 00:00:00 2001 From: ds_93 Date: Fri, 20 Sep 2024 11:20:37 +0200 Subject: [PATCH 04/13] Revert "Revert "removed hardcoded idc-index-data version"" This reverts commit 585c8592aa5e6226d8b0fd75168b3f575e5970ef. 
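
With the hardcoded release URL dropped again, the remote asset endpoint is
derived from the installed idc-index-data package. A minimal sketch of the
resulting lookup, assuming idc_index_data.__version__ matches a published
GitHub release tag that hosts the per-index parquet assets:

    import idc_index_data

    asset_endpoint_url = (
        "https://github.com/ImagingDataCommons/idc-index-data/releases/download/"
        f"{idc_index_data.__version__}"
    )
    # e.g. the Slide Microscopy index fetched by IDCClient.fetch_index():
    sm_index_url = f"{asset_endpoint_url}/sm_index.parquet"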
--- idc_index/index.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 5848486d..e70aac38 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -22,11 +22,6 @@ aws_endpoint_url = "https://s3.amazonaws.com" gcp_endpoint_url = "https://storage.googleapis.com" asset_endpoint_url = f"https://github.com/ImagingDataCommons/idc-index-data/releases/download/{idc_index_data.__version__}" -# TODO: remove later -asset_endpoint_url = ( - "https://github.com/ImagingDataCommons/idc-index-data/releases/download/18.2.0" -) - logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO) logger = logging.getLogger(__name__) From 8cf5722dfe3f1c70cc9ff689f28aaa6a3716846b Mon Sep 17 00:00:00 2001 From: ds_93 Date: Fri, 20 Sep 2024 11:26:22 +0200 Subject: [PATCH 05/13] re-done merging of format_size functions --- idc_index/index.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index e70aac38..04a733a6 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -1001,7 +1001,7 @@ def _track_download_progress( logger.info( "Initial size of the directory: %s", - IDCClient._format_size_bytes(initial_size_bytes), + IDCClient._format_size(initial_size_bytes, size_in_bytes=True), ) logger.info( "Approximate size of the files that need to be downloaded: %s", @@ -1333,26 +1333,21 @@ def _s5cmd_run( logger.info("Successfully downloaded files to %s", str(downloadDir)) @staticmethod - def _format_size(size_MB): + def _format_size(size, size_in_bytes: bool = False): + if size_in_bytes: + size_MB = size / (10**6) + else: + size_MB = size size_GB = size_MB / 1000 size_TB = size_GB / 1000 if size_TB >= 1: return f"{round(size_TB, 2)} TB" - if size_GB >= 1: - return f"{round(size_GB, 2)} GB" - return f"{round(size_MB, 2)} MB" - - @staticmethod - def _format_size_bytes(size_bytes): - size_MB = size_bytes / (10**6) - size_GB = size_MB / 1000 - if size_GB >= 1: return f"{round(size_GB, 2)} GB" if size_MB >= 1: return f"{round(size_MB, 2)} MB" - return f"{round(size_bytes, 2)} bytes" + return f"{round(size, 2)} bytes" def download_from_manifest( self, From a1f272dd53e573a824797fcad84a65a2829e5edc Mon Sep 17 00:00:00 2001 From: ds_93 Date: Fri, 20 Sep 2024 11:40:16 +0200 Subject: [PATCH 06/13] fixed variable used before assignment --- idc_index/index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 43dd3f71..50820be4 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -1587,10 +1587,10 @@ def download_from_selection( if not sopInstanceUID: total_size = round(result_df["series_size_MB"].sum(), 2) else: - total_size_bytes = round(result_df["instance_size"].sum(), 2) + total_size = round(result_df["instance_size"].sum(), 2) logger.info( "Total size of files to download: " - + self._format_size(total_size_bytes, size_in_bytes=True) + + self._format_size(total_size, size_in_bytes=True) ) disk_free_space_MB = psutil.disk_usage(downloadDir).free / (1000 * 1000) From d49f375ace1b207aafd677e6c9c2ed90df3800a9 Mon Sep 17 00:00:00 2001 From: ds_93 Date: Fri, 20 Sep 2024 12:07:54 +0200 Subject: [PATCH 07/13] fixed false not enough free space error --- idc_index/index.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 50820be4..4e1075df 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -1587,11 +1587,12 @@ def download_from_selection( if not 
sopInstanceUID: total_size = round(result_df["series_size_MB"].sum(), 2) else: - total_size = round(result_df["instance_size"].sum(), 2) + total_size_bytes = round(result_df["instance_size"].sum(), 2) logger.info( "Total size of files to download: " - + self._format_size(total_size, size_in_bytes=True) + + self._format_size(total_size_bytes, size_in_bytes=True) ) + total_size = total_size_bytes / (10**6) disk_free_space_MB = psutil.disk_usage(downloadDir).free / (1000 * 1000) if disk_free_space_MB < total_size: From bbef810a398a6cb9923997daca37ffb318d82c07 Mon Sep 17 00:00:00 2001 From: ds_93 Date: Mon, 30 Sep 2024 12:00:08 +0200 Subject: [PATCH 08/13] removed merge with main index when adding new index --- idc_index/index.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 4e1075df..5a0b7b35 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -263,13 +263,8 @@ def fetch_index(self, index) -> None: with open(filepath, mode="wb") as file: file.write(response.content) - # Join new index with main index - sm_instance_index = pd.read_parquet(filepath) - sm_instance_index = sm_instance_index.merge( - self.index, on="SeriesInstanceUID", how="left" - ) - - setattr(self.__class__, index, sm_instance_index) + index_table = pd.read_parquet(filepath) + setattr(self.__class__, index, index_table) self.indices_overview[index]["installed"] = True else: From e93bad2933f135e6bb5771b62b2733f1ab7206d4 Mon Sep 17 00:00:00 2001 From: ds_93 Date: Mon, 30 Sep 2024 12:47:54 +0200 Subject: [PATCH 09/13] modified sql query in download_from_selection in case of instance download to be able to catch attributes necessary for hierarchy template --- idc_index/index.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/idc_index/index.py b/idc_index/index.py index 5a0b7b35..8623e785 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -264,6 +264,10 @@ def fetch_index(self, index) -> None: file.write(response.content) index_table = pd.read_parquet(filepath) + # index_table = index_table.merge( + # self.index[["series_aws_url", "SeriesInstanceUID"]], + # on="SeriesInstanceUID", how="left" + # ) setattr(self.__class__, index, index_table) self.indices_overview[index]["installed"] = True @@ -1617,6 +1621,7 @@ def download_from_selection( downloadDir=downloadDir, dirTemplate=dirTemplate, ) + if sopInstanceUID: sql = f""" WITH temp as @@ -1633,6 +1638,8 @@ def download_from_selection( temp JOIN sm_instance_index using (sopInstanceUID) + JOIN + index using (seriesInstanceUID) """ else: sql = f""" From 86bb42d8297a2a6e53c303b0c8fd5bfc34b78a7d Mon Sep 17 00:00:00 2001 From: ds_93 Date: Tue, 1 Oct 2024 11:29:38 +0200 Subject: [PATCH 10/13] trim trailing whitespaces fix --- idc_index/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/idc_index/index.py b/idc_index/index.py index ee987c85..f87ea432 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -1594,7 +1594,7 @@ def download_from_selection( ) else: hierarchy = f"CONCAT('{downloadDir}')" - + if sopInstanceUID: sql = f""" WITH temp as From 2f81497acf1d5385488f3f7c84b048ffa0221c4d Mon Sep 17 00:00:00 2001 From: ds_93 Date: Tue, 1 Oct 2024 11:39:21 +0200 Subject: [PATCH 11/13] added index_crdc_series_uuid --- idc_index/index.py | 1 + 1 file changed, 1 insertion(+) diff --git a/idc_index/index.py b/idc_index/index.py index f87ea432..397f93b3 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -1607,6 +1607,7 @@ def download_from_selection( SELECT 
series_aws_url, CONCAT(TRIM('*' FROM series_aws_url), crdc_instance_uuid, '.dcm') as instance_url, + REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) index_crdc_series_uuid, series_size_MB, {hierarchy} as path FROM From ef0101d0a002be8d30ad4f40b6b7ba6dc03a516b Mon Sep 17 00:00:00 2001 From: Andrey Fedorov Date: Mon, 7 Oct 2024 15:34:41 -0400 Subject: [PATCH 12/13] BUG: clean up after conflict resolution --- idc_index/index.py | 12 +++++------- tests/idcindex.py | 3 +++ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 0d113bc1..c6d0ea5b 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -153,7 +153,7 @@ def _safe_filter_by_selection( patientId, studyInstanceUID, seriesInstanceUID, - sopInstanceUID=None, + sopInstanceUID, crdc_series_uuid, ): if collection_id is not None: @@ -205,10 +205,10 @@ def _safe_filter_by_selection( if sopInstanceUID is not None: result_df = IDCClient._filter_by_dicom_instance_uid( - result_df, sopInstanceUID + df_index, sopInstanceUID ) return result_df - + if seriesInstanceUID is not None: result_df = IDCClient._filter_by_dicom_series_uid( df_index, seriesInstanceUID @@ -1512,6 +1512,7 @@ def citations_from_selection( patientId=patientId, studyInstanceUID=studyInstanceUID, seriesInstanceUID=seriesInstanceUID, + sopInstanceUID=None, crdc_series_uuid=None, ) @@ -1604,10 +1605,7 @@ def download_from_selection( raise ValueError( "Instance-level access not possible because instance-level index not installed." ) - else: - download_df = self.index - - if crdc_series_uuid is not None: + elif crdc_series_uuid is not None: download_df = pd.concat( [ self.index[ diff --git a/tests/idcindex.py b/tests/idcindex.py index 18c8311a..66b875b9 100644 --- a/tests/idcindex.py +++ b/tests/idcindex.py @@ -15,6 +15,9 @@ # Run tests using the following command from the root of the repository: # python -m unittest -vv tests/idcindex.py +# +# run specific tests with this: +# pytest ./tests/idcindex.py::TestIDCClient.test_download_dicom_instance logging.basicConfig(level=logging.DEBUG) From 5af1112dd767419358d2ee70e1222b58cdaede0d Mon Sep 17 00:00:00 2001 From: Andrey Fedorov Date: Tue, 8 Oct 2024 10:27:55 -0400 Subject: [PATCH 13/13] ENH: update to disable sync for instance-level download We do not currently support instance-level manifests as input to the command-line download tool. We also do not provide any mechanism to generate instance-level manifests from the portal. Sync operation is using series level size for estimating progress. Implementing support for syncing instance-level manifests will take work, which would not be justified: for now, I think it is safe to assume instance-level download will only be invoked by passing SOPInstanceUID to the functions/download tool, and with just a few instances/files, syncing those instead of copy won't bring much benefit. --- idc_index/index.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index c6d0ea5b..f6f85385 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -1605,6 +1605,11 @@ def download_from_selection( raise ValueError( "Instance-level access not possible because instance-level index not installed." ) + if use_s5cmd_sync: + logger.warning( + "s5cmd sync is not supported for downloading individual files. Disabling sync." 
+ ) + use_s5cmd_sync = False elif crdc_series_uuid is not None: download_df = pd.concat( [ @@ -1699,9 +1704,8 @@ def download_from_selection( ) SELECT series_aws_url, - CONCAT(TRIM('*' FROM series_aws_url), crdc_instance_uuid, '.dcm') as instance_url, + CONCAT(TRIM('*' FROM series_aws_url), crdc_instance_uuid, '.dcm') as instance_aws_url, REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) index_crdc_series_uuid, - series_size_MB, {hierarchy} as path FROM temp @@ -1735,13 +1739,13 @@ def download_from_selection( with tempfile.NamedTemporaryFile(mode="w", delete=False) as manifest_file: # Determine column containing the URL for instance / series-level access if sopInstanceUID: - if not "instance_url" in result_df: - result_df["instance_url"] = ( + if not "instance_aws_url" in result_df: + result_df["instance_aws_url"] = ( result_df["series_aws_url"].replace("/*", "/") + result_df["crdc_instance_uuid"] + ".dcm" ) - url_column = "instance_url" + url_column = "instance_aws_url" else: url_column = "series_aws_url" @@ -1776,6 +1780,12 @@ def download_from_selection( Temporary download manifest is generated and is passed to self._s5cmd_run """ ) + if sopInstanceUID: + s5cmd_sync_helper_df = None + else: + s5cmd_sync_helper_df = result_df[ + ["index_crdc_series_uuid", "s5cmd_cmd", "series_size_MB", "path"] + ] self._s5cmd_run( endpoint_to_use=aws_endpoint_url, manifest_file=Path(manifest_file.name), @@ -1786,9 +1796,7 @@ def download_from_selection( use_s5cmd_sync=use_s5cmd_sync, dirTemplate=dirTemplate, list_of_directories=list_of_directories, - s5cmd_sync_helper_df=result_df[ - ["index_crdc_series_uuid", "s5cmd_cmd", "series_size_MB", "path"] - ], + s5cmd_sync_helper_df=s5cmd_sync_helper_df, ) def download_dicom_instance(