From 6b7b41cb191fdf4b4222284156f06ff80a2a5622 Mon Sep 17 00:00:00 2001
From: Andrey Fedorov
Date: Fri, 2 Aug 2024 17:50:18 -0400
Subject: [PATCH 01/13] STYLE: remove DF conversion function for the index
 overview

In the future, we can add a convenience conversion utility function that
could serve the same purpose without being attached to a specific class
variable.

BUG: fixed codespell complaint
BUG: fixed pylint errors
BUG: fixed pylint errors
BUG: fixed test
ENH: added printing configurations for codespell

added download size calculation for single instance download. Changed tqdm
bar data to be displayed in GB/MB instead of bytes

ENH: enable downloading data in manifests from previous IDC versions
ENH: add description for the previous versions index
ENH: fix error messages for items not identified in the current version
ENH: add clinical_index

also added checks for existence of the URLs containing remote indices

BUG: use trim to remove any extraneous spaces while parsing s3 url in manifest
ENH: simplify s3_url extraction
update to simplify to use clinical_index from idc-index
clarify wording of a section
BUG: remove notebook committed by accident from Colab
BUG: fix viewer series selection parameter

SeriesInstanceUID changed to SeriesInstanceUIDs, see
https://docs.ohif.org/migration-guide/from-3p7-to-3p8#studyinstanceuid-in-the-url-param

ENH: simplify download and create destination directory if needed

Automatic creation of the destination directory mimics the behavior of
s5cmd and simplifies usage

DOC: fix explanation of the dirTemplate parameter
fixed bug
fixed bug second try
wip
ENH: update to IDC v19
ENH: upgrade idc-index-data and use query-based clinical_index
removed hardcoded idc-index-data version
---
 .pre-commit-config.yaml |   1 +
 idc_index/cli.py        |   4 +-
 idc_index/index.py      | 409 +++++++++++++++++++++++++++++++---------
 pyproject.toml          |   2 +-
 tests/idcindex.py       |  35 +++-
 5 files changed, 352 insertions(+), 99 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a8ce7587..b1b5f058 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -60,6 +60,7 @@ repos:
     rev: "v2.2.6"
     hooks:
       - id: codespell
+        args: ["--quiet-level 3"]

   - repo: https://github.com/shellcheck-py/shellcheck-py
     rev: "v0.9.0.6"

diff --git a/idc_index/cli.py b/idc_index/cli.py
index 09fed46c..3171276b 100644
--- a/idc_index/cli.py
+++ b/idc_index/cli.py
@@ -113,7 +113,7 @@ def set_log_level(log_level):
     "--dir-template",
     type=str,
     default=IDCClient.DOWNLOAD_HIERARCHY_DEFAULT,
-    help="Download directory hierarchy template. This variable defines the folder hierarchy for the organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None all files will be downloaded to the download directory with no subdirectories.",
+    help="Download directory hierarchy template. 
This variable defines the folder hierarchy for organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to empty string (\"\") all files will be downloaded to the download directory with no subdirectories.",
 )
 def download_from_selection(
     download_dir,
@@ -233,7 +233,7 @@ def download_from_selection(
     "--dir-template",
     type=str,
     default=IDCClient.DOWNLOAD_HIERARCHY_DEFAULT,
-    help="Download directory hierarchy template. This variable defines the folder hierarchy for the organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None all files will be downloaded to the download directory with no subdirectories.",
+    help="Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to empty string (\"\") all files will be downloaded to the download directory with no subdirectories.",
 )
 def download_from_manifest(
     manifest_file,
diff --git a/idc_index/index.py b/idc_index/index.py
index 5cc76be8..fbbe7694 100644
--- a/idc_index/index.py
+++ b/idc_index/index.py
@@ -41,7 +41,7 @@ class IDCClient:
     CITATION_FORMAT_BIBTEX = "application/x-bibtex"

     # Singleton pattern
-    # NOTE: In the future, one may want to use multiple clients e.g. for sub-datasets so a attribute-singleton as shown bewlo seems a better option.
+    # NOTE: In the future, one may want to use multiple clients, e.g. for sub-datasets, so an attribute-singleton as shown below seems a better option. 
# _instance: IDCClient # def __new__(cls): # if not hasattr(cls, "_instance") or getattr(cls, "_instance") is None: @@ -63,18 +63,30 @@ def __init__(self): file_path = idc_index_data.IDC_INDEX_PARQUET_FILEPATH logger.debug(f"Reading index file v{idc_index_data.__version__}") self.index = pd.read_parquet(file_path) + + self.previous_versions_index_path = ( + idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH + ) + # self.index = self.index.astype(str).replace("nan", "") self.index["series_size_MB"] = self.index["series_size_MB"].astype(float) self.collection_summary = self.index.groupby("collection_id").agg( {"Modality": pd.Series.unique, "series_size_MB": "sum"} ) + idc_version = f"v{Version(idc_index_data.__version__).major}" + self.indices_overview = { "index": { "description": "Main index containing one row per DICOM series.", "installed": True, "url": None, }, + "previous_versions_index": { + "description": "index containing one row per DICOM series from all previous IDC versions that are not in current version.", + "installed": True, + "url": None, + }, "sm_index": { "description": "DICOM Slide Microscopy series-level index.", "installed": False, @@ -85,6 +97,11 @@ def __init__(self): "installed": False, "url": f"{asset_endpoint_url}/sm_instance_index.parquet", }, + "clinical_index": { + "description": "Index of clinical data accompanying the available images.", + "installed": False, + "url": f"{asset_endpoint_url}/clinical_index.parquet", + }, } # Lookup s5cmd @@ -119,7 +136,12 @@ def _filter_dataframe_by_id(key, dataframe, _id): @staticmethod def _safe_filter_by_selection( - df_index, collection_id, patientId, studyInstanceUID, seriesInstanceUID + df_index, + collection_id, + patientId, + studyInstanceUID, + seriesInstanceUID, + sopInstanceUID=None, ): if collection_id is not None: if not isinstance(collection_id, str) and not isinstance( @@ -139,6 +161,11 @@ def _safe_filter_by_selection( seriesInstanceUID, list ): raise TypeError("seriesInstanceUID must be a string or list of strings") + if sopInstanceUID is not None: + if not isinstance(sopInstanceUID, str) and not isinstance( + sopInstanceUID, list + ): + raise TypeError("sopInstanceUID must be a string or list of strings") if collection_id is not None: result_df = IDCClient._filter_by_collection_id(df_index, collection_id) @@ -158,6 +185,11 @@ def _safe_filter_by_selection( result_df, seriesInstanceUID ) + if sopInstanceUID is not None: + result_df = IDCClient._filter_by_dicom_instance_uid( + result_df, sopInstanceUID + ) + return result_df @staticmethod @@ -182,6 +214,12 @@ def _filter_by_dicom_series_uid(df_index, dicom_series_uid): "SeriesInstanceUID", df_index, dicom_series_uid ) + @staticmethod + def _filter_by_dicom_instance_uid(df_index, dicom_instance_uid): + return IDCClient._filter_dataframe_by_id( + "SOPInstanceUID", df_index, dicom_instance_uid + ) + @staticmethod def get_idc_version(): """ @@ -190,19 +228,19 @@ def get_idc_version(): idc_version = Version(idc_index_data.__version__).major return f"v{idc_version}" - def list_indices(self): + @staticmethod + def _check_create_directory(download_dir): """ - Lists all available indices including their installation status. - - Returns: - indices_overview (pd.DataFrame): DataFrame containing information per index. 
+        Mimic the behavior of s5cmd and create the download directory if it does not exist.
         """
+        download_dir = Path(download_dir)
+        download_dir.mkdir(parents=True, exist_ok=True)

-        return pd.DataFrame.from_dict(self.indices_overview, orient="index")
+        return str(download_dir.resolve())

     def fetch_index(self, index) -> None:
         """
-        Downloads requested index.
+        Downloads the requested index and adds it, joined with the main index, as a class attribute.

         Args:
             index (str): Name of the index to be downloaded.
@@ -221,10 +259,19 @@ def fetch_index(self, index) -> None:
                 idc_index_data.IDC_INDEX_PARQUET_FILEPATH.parents[0],
                 f"{index}.parquet",
             )
+
             with open(filepath, mode="wb") as file:
                 file.write(response.content)
-            setattr(self.__class__, index, pd.read_parquet(filepath))
+
+            # Join new index with main index
+            sm_instance_index = pd.read_parquet(filepath)
+            sm_instance_index = sm_instance_index.merge(
+                self.index, on="SeriesInstanceUID", how="left"
+            )
+
+            setattr(self.__class__, index, sm_instance_index)
             self.indices_overview[index]["installed"] = True
+
         else:
             logger.error(
                 f"Failed to fetch index from URL {self.indices_overview[index]['url']}: {response.status_code}"
@@ -501,7 +548,7 @@ def get_viewer_URL(
             ohif_v2, ohif_v3, or slim. If not provided, default viewers will be used.

         Returns:
-            string containing the IDC viewer URL for the given SeriesInstanceUID
+            string containing the IDC viewer URL for the requested selection
         """

         if seriesInstanceUID is None and studyInstanceUID is None:
@@ -574,7 +621,7 @@ def get_viewer_URL(
         if seriesInstanceUID is None:
             viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/v3/viewer/?StudyInstanceUIDs={studyInstanceUID}"
         else:
-            viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/v3/viewer/?StudyInstanceUIDs={studyInstanceUID}&SeriesInstanceUID={seriesInstanceUID}"
+            viewer_url = f"https://viewer.imaging.datacommons.cancer.gov/v3/viewer/?StudyInstanceUIDs={studyInstanceUID}&SeriesInstanceUIDs={seriesInstanceUID}"
     elif viewer_selector == "volview":
         # TODO! 
Not implemented yet viewer_url = None @@ -627,6 +674,14 @@ def _validate_update_manifest_and_get_download_size( # create a copy of the index index_df_copy = self.index + # use default hierarchy + if dirTemplate is not None: + hierarchy = self._generate_sql_concat_for_building_directory( + dirTemplate=dirTemplate, downloadDir=downloadDir + ) + else: + hierarchy = "NULL" + # Extract s3 url and crdc_series_uuid from the manifest copy commands # Next, extract crdc_series_uuid from aws_series_url in the index and # try to verify if every series in the manifest is present in the index @@ -634,7 +689,7 @@ def _validate_update_manifest_and_get_download_size( # TODO: need to remove the assumption that manifest commands will have 'cp' # and need to parse S3 URL directly # ruff: noqa - sql = """ + sql = f""" PRAGMA disable_progress_bar; WITH index_temp AS ( @@ -642,19 +697,21 @@ def _validate_update_manifest_and_get_download_size( seriesInstanceUID, series_aws_url, series_size_MB, - REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){3}([^\\/?#]+)', 1) index_crdc_series_uuid + {hierarchy} AS path, + REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) index_crdc_series_uuid FROM index_df_copy), manifest_temp AS ( SELECT manifest_cp_cmd, - REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){3}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid, - REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') AS s3_url, + REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid, + REGEXP_EXTRACT(manifest_cp_cmd, 's3://\\S+') AS s3_url, FROM manifest_df ) SELECT seriesInstanceuid, s3_url, + path, series_size_MB, index_crdc_series_uuid is not NULL as crdc_series_uuid_match, s3_url==series_aws_url AS s3_url_match, @@ -682,12 +739,83 @@ def _validate_update_manifest_and_get_download_size( missing_manifest_cp_cmds = merged_df.loc[ ~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd" ] - logger.error( - "The following manifest copy commands are not recognized as referencing any associated series in the index.\n" + missing_in_main_cnt = len(missing_manifest_cp_cmds.tolist()) + logger.warning( + f"The total of {missing_in_main_cnt} copy commands are not recognized as referencing any associated series in the main index.\n" "This means either these commands are invalid, or they may correspond to files available in a release of IDC\n" - f"different from {self.get_idc_version()} used in this version of idc-index. The corresponding files will not be downloaded.\n" + f"different from {self.get_idc_version()} used in this version of idc-index. Prior data releases will be checked next." 
) - logger.error("\n" + "\n".join(missing_manifest_cp_cmds.tolist())) + + logger.debug( + "Checking if the requested data is available in other idc versions " + ) + missing_series_sql = f""" + PRAGMA disable_progress_bar; + WITH + combined_index AS + (SELECT + *, + {hierarchy} AS path, + FROM + index_df_copy + union by name + SELECT + *, + {hierarchy} AS path, + FROM + '{self.previous_versions_index_path}' pvip + + ), + index_temp AS ( + SELECT + seriesInstanceUID, + series_aws_url, + series_size_MB, + path, + REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) index_crdc_series_uuid + FROM + combined_index), + manifest_temp AS ( + SELECT + manifest_cp_cmd, + REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid, + REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') AS s3_url, + FROM + manifest_df ) + SELECT + seriesInstanceuid, + s3_url, + path, + series_size_MB, + index_crdc_series_uuid is not NULL as crdc_series_uuid_match, + TRIM(s3_url) = TRIM(series_aws_url) AS s3_url_match, + manifest_temp.manifest_cp_cmd, + CASE + WHEN TRIM(s3_url) = TRIM(series_aws_url) THEN 'aws' + ELSE + 'unknown' + END + AS endpoint + FROM + manifest_temp + LEFT JOIN + index_temp + ON + index_temp.index_crdc_series_uuid = manifest_temp.manifest_crdc_series_uuid + """ + merged_df = duckdb.query(missing_series_sql).df() + if not all(merged_df["crdc_series_uuid_match"]): + missing_manifest_cp_cmds = merged_df.loc[ + ~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd" + ] + logger.error( + "The following manifest copy commands are not recognized as referencing any associated series in any release of IDC.\n" + "This means either these commands are invalid. Please submit an issue on https://github.com/ImagingDataCommons/idc-index/issues \n" + "The corresponding files could not be downloaded.\n" + ) + logger.error("\n" + "\n".join(missing_manifest_cp_cmds.tolist())) + else: + logger.info("All of the identifiers from manifest have been resolved!") if validate_manifest: # Check if there is more than one endpoint @@ -738,29 +866,29 @@ def _validate_update_manifest_and_get_download_size( total_size = merged_df["series_size_MB"].sum() total_size = round(total_size, 2) - if dirTemplate is not None: - hierarchy = self._generate_sql_concat_for_building_directory( - dirTemplate=dirTemplate, downloadDir=downloadDir - ) - sql = f""" - WITH temp as - ( - SELECT - seriesInstanceUID, - s3_url - FROM - merged_df - ) - SELECT - s3_url, - {hierarchy} as path - FROM - temp - JOIN - index using (seriesInstanceUID) - """ - logger.debug(f"About to run this query:\n{sql}") - merged_df = self.sql_query(sql) + # if dirTemplate is not None: + # hierarchy = self._generate_sql_concat_for_building_directory( + # dirTemplate=dirTemplate, downloadDir=downloadDir + # ) + # sql = f""" + # WITH temp as + # ( + # SELECT + # seriesInstanceUID, + # s3_url + # FROM + # merged_df + # ) + # SELECT + # s3_url, + # {hierarchy} as path + # FROM + # temp + # JOIN + # index using (seriesInstanceUID) + # """ + # logger.debug(f"About to run this query:\n{sql}") + # merged_df = self.sql_query(sql) # Write a temporary manifest file with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_manifest_file: if use_s5cmd_sync and len(os.listdir(downloadDir)) != 0: @@ -865,41 +993,40 @@ def _track_download_progress( runtime_errors = [] if show_progress_bar: - total_size_bytes = size_MB * 10**6 # Convert MB to bytes - # temporary place holder. 
Accurate size is calculated in the next step + total_size_to_be_downloaded_bytes = size_MB * (10**6) initial_size_bytes = 0 # Calculate the initial size of the directory for directory in list_of_directories: initial_size_bytes = IDCClient._get_dir_sum_file_size(directory) - logger.info("Initial size of the directory: %s bytes", initial_size_bytes) logger.info( - "Approximate size of the files that need to be downloaded: %s bytes", - total_size_bytes, + "Initial size of the directory: %s", + IDCClient._format_size_bytes(initial_size_bytes), + ) + logger.info( + "Approximate size of the files that need to be downloaded: %s", + IDCClient._format_size(size_MB), ) pbar = tqdm( - total=total_size_bytes, + total=total_size_to_be_downloaded_bytes, unit="B", unit_scale=True, desc="Downloading data", ) while True: + time.sleep(0.5) downloaded_bytes = 0 for directory in list_of_directories: downloaded_bytes += IDCClient._get_dir_sum_file_size(directory) downloaded_bytes -= initial_size_bytes pbar.n = min( - downloaded_bytes, total_size_bytes + downloaded_bytes, total_size_to_be_downloaded_bytes ) # Prevent the progress bar from exceeding 100% pbar.refresh() - if process.poll() is not None: break - - time.sleep(0.5) - # Wait for the process to finish _, stderr = process.communicate() pbar.close() @@ -1133,7 +1260,7 @@ def _s5cmd_run( logger.info( f"Requested total download size is {total_size} MB, \ however at least {existing_data_size} MB is already present,\ - so downloading only remaining upto {sync_size} MB\n\ + so downloading only remaining up to {sync_size} MB\n\ Please note that disk sizes are calculated at series level, \ so if individual files are missing, displayed progress bar may\ not be accurate." @@ -1216,6 +1343,17 @@ def _format_size(size_MB): return f"{round(size_GB, 2)} GB" return f"{round(size_MB, 2)} MB" + @staticmethod + def _format_size_bytes(size_bytes): + size_MB = size_bytes / (10**6) + size_GB = size_MB / 1000 + + if size_GB >= 1: + return f"{round(size_GB, 2)} GB" + if size_MB >= 1: + return f"{round(size_MB, 2)} MB" + return f"{round(size_bytes, 2)} bytes" + def download_from_manifest( self, manifestFile: str, @@ -1245,9 +1383,7 @@ def download_from_manifest( ValueError: If the download directory does not exist. """ - downloadDir = os.path.abspath(downloadDir).replace("\\", "/") - if not os.path.exists(downloadDir): - raise ValueError("Download directory does not exist.") + downloadDir = self._check_create_directory(downloadDir) # validate the manifest ( @@ -1402,6 +1538,7 @@ def download_from_selection( patientId=None, studyInstanceUID=None, seriesInstanceUID=None, + sopInstanceUID=None, quiet=True, show_progress_bar=True, use_s5cmd_sync=False, @@ -1417,6 +1554,7 @@ def download_from_selection( patientId: string or list of strings containing the values of PatientID to filter by studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by + sopInstanceUID: string or list of strings containing the values of DICOM SOPInstanceUID to filter by quiet (bool): If True, suppresses the output of the subprocess. 
Defaults to True show_progress_bar (bool): If True, tracks the progress of download use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded @@ -1424,24 +1562,43 @@ def download_from_selection( """ - downloadDir = os.path.abspath(downloadDir).replace("\\", "/") - if not os.path.exists(downloadDir): - raise ValueError("Download directory does not exist.") + downloadDir = self._check_create_directory(downloadDir) + + # If SOPInstanceUID(s) are given, we need to join the main index with the instance-level index + if sopInstanceUID: + if hasattr( + self, "sm_instance_index" + ): # check if instance-level index is installed + index_to_be_filtered = self.sm_instance_index + else: + logger.error( + "Instance-level access not possible because instance-level index not installed." + ) + raise ValueError( + "Instance-level access not possible because instance-level index not installed." + ) + else: + index_to_be_filtered = self.index result_df = self._safe_filter_by_selection( - self.index, + index_to_be_filtered, collection_id=collection_id, patientId=patientId, studyInstanceUID=studyInstanceUID, seriesInstanceUID=seriesInstanceUID, + sopInstanceUID=sopInstanceUID, ) - total_size = round(result_df["series_size_MB"].sum(), 2) + if not sopInstanceUID: + total_size = round(result_df["series_size_MB"].sum(), 2) + else: + total_size_bytes = round(result_df["instance_size"].sum(), 2) + total_size = total_size_bytes / (10**6) # in MB logger.info("Total size of files to download: " + self._format_size(total_size)) logger.info( "Total free space on disk: " + str(psutil.disk_usage(downloadDir).free / (1000 * 1000 * 1000)) - + "GB" + + " GB" ) if dry_run: @@ -1455,51 +1612,76 @@ def download_from_selection( downloadDir=downloadDir, dirTemplate=dirTemplate, ) - sql = f""" - WITH temp as - ( - SELECT - seriesInstanceUID - FROM - result_df - ) - SELECT - series_aws_url, - {hierarchy} as path - FROM - temp - JOIN - index using (seriesInstanceUID) - """ + if sopInstanceUID: + sql = f""" + WITH temp as + ( + SELECT + sopInstanceUID + FROM + result_df + ) + SELECT + CONCAT(TRIM('*' FROM series_aws_url), crdc_instance_uuid, '.dcm') as instance_url, + CONCAT({hierarchy}, '/') as path + FROM + temp + JOIN + sm_instance_index using (sopInstanceUID) + """ + else: + sql = f""" + WITH temp as + ( + SELECT + seriesInstanceUID + FROM + result_df + ) + SELECT + series_aws_url, + {hierarchy} as path + FROM + temp + JOIN + index using (seriesInstanceUID) + """ result_df = self.sql_query(sql) - # Download the files - # make temporary file to store the list of files to download + # Download the files and make temporary file to store the list of files to download + with tempfile.NamedTemporaryFile(mode="w", delete=False) as manifest_file: + # Determine column containing the URL for instance / series-level access + if sopInstanceUID: + if not "instance_url" in result_df: + result_df["instance_url"] = ( + result_df["series_aws_url"].replace("/*", "/") + + result_df["crdc_instance_uuid"] + + ".dcm" + ) + url_column = "instance_url" + else: + url_column = "series_aws_url" + if use_s5cmd_sync and len(os.listdir(downloadDir)) != 0: if dirTemplate is not None: result_df["s5cmd_cmd"] = ( - "sync " - + result_df["series_aws_url"] - + ' "' - + result_df["path"] - + '"' + "sync " + result_df[url_column] + ' "' + result_df["path"] + '"' ) else: result_df["s5cmd_cmd"] = ( - "sync " + 
result_df["series_aws_url"] + ' "' + downloadDir + '"'
+                    "sync " + result_df[url_column] + ' "' + downloadDir + '"'
                 )
             elif dirTemplate is not None:
                 result_df["s5cmd_cmd"] = (
-                    "cp " + result_df["series_aws_url"] + ' "' + result_df["path"] + '"'
+                    "cp " + result_df[url_column] + ' "' + result_df["path"] + '"'
                 )
             else:
                 result_df["s5cmd_cmd"] = (
-                    "cp " + result_df["series_aws_url"] + ' "' + downloadDir + '"'
+                    "cp " + result_df[url_column] + ' "' + downloadDir + '"'
                 )

             # Combine all commands into a single string with newline separators
             commands = "\n".join(result_df["s5cmd_cmd"])
-
             manifest_file.write(commands)

         if dirTemplate is not None:
@@ -1523,6 +1705,44 @@ def download_from_selection(
             list_of_directories=list_of_directories,
         )

+    def download_dicom_instance(
+        self,
+        sopInstanceUID,
+        downloadDir,
+        dry_run=False,
+        quiet=True,
+        show_progress_bar=True,
+        use_s5cmd_sync=False,
+        dirTemplate=DOWNLOAD_HIERARCHY_DEFAULT,
+    ) -> None:
+        """
+        Download the files corresponding to the given SOPInstanceUID(s) to the specified directory.
+
+        Args:
+            sopInstanceUID: string or list of strings containing the values of DICOM SOPInstanceUID to filter by
+            downloadDir: string containing the path to the directory to download the files to
+            dry_run: calculates the size of the cohort but does not start the download
+            quiet (bool): If True, suppresses the output of the subprocess. Defaults to True.
+            show_progress_bar (bool): If True, tracks the progress of download
+            use_s5cmd_sync (bool): If True, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded
+            dirTemplate (str): Download directory hierarchy template. This variable defines the folder hierarchy for organizing the downloaded files in downloadDirectory. Defaults to index.DOWNLOAD_HIERARCHY_DEFAULT set to %collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID. The template string can be built using a combination of selected metadata attributes (PatientID, collection_id, Modality, StudyInstanceUID, SeriesInstanceUID) that must be prefixed by '%'. The following special characters can be used as separators: '-' (hyphen), '/' (slash for subdirectories), '_' (underscore). When set to None all files will be downloaded to the download directory with no subdirectories. 
+ + Returns: None + + Raises: + TypeError: If sopInstanceUID(s) passed is(are) not a string or list + + """ + self.download_from_selection( + downloadDir, + sopInstanceUID=sopInstanceUID, + dry_run=dry_run, + quiet=quiet, + show_progress_bar=show_progress_bar, + use_s5cmd_sync=use_s5cmd_sync, + dirTemplate=dirTemplate, + ) + def download_dicom_series( self, seriesInstanceUID, @@ -1689,4 +1909,11 @@ def sql_query(self, sql_query): """ index = self.index + # TODO: find a more elegant way to automate the following + if hasattr(self, "sm_index"): + sm_index = self.sm_index + if hasattr(self, "sm_instance_index"): + sm_instance_index = self.sm_instance_index + if hasattr(self, "clinical_index"): + clinical_index = self.clinical_index return duckdb.query(sql_query).to_df() diff --git a/pyproject.toml b/pyproject.toml index 92304920..f48c9ad0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ dynamic = ["version"] dependencies = [ "click", 'duckdb>=0.10.0', - "idc-index-data==18.1.0", + "idc-index-data==19.0.1", "packaging", "pandas<2.2", "psutil", diff --git a/tests/idcindex.py b/tests/idcindex.py index 168ac1f0..4038afdb 100644 --- a/tests/idcindex.py +++ b/tests/idcindex.py @@ -9,6 +9,7 @@ import pandas as pd import pytest +import requests from click.testing import CliRunner from idc_index import IDCClient, cli @@ -18,6 +19,17 @@ logging.basicConfig(level=logging.DEBUG) +def remote_file_exists(url): + try: + response = requests.head(url, allow_redirects=True) + # Check if the status code indicates success + return response.status_code == 200 + except requests.RequestException as e: + # Handle any exceptions (e.g., network issues) + print(f"An error occurred: {e}") + return False + + @pytest.fixture(autouse=True) def _change_test_dir(request, monkeypatch): monkeypatch.chdir(request.fspath.dirname) @@ -174,6 +186,17 @@ def test_download_dicom_series(self): ) self.assertEqual(sum([len(files) for r, d, files in os.walk(temp_dir)]), 3) + def test_download_dicom_instance(self): + i = IDCClient() + i.fetch_index("sm_instance_index") + with tempfile.TemporaryDirectory() as temp_dir: + self.client.download_dicom_instance( + sopInstanceUID="1.3.6.1.4.1.5962.99.1.528744472.1087975700.1641206284312.14.0", + downloadDir=temp_dir, + ) + + self.assertEqual(sum([len(files) for r, d, files in os.walk(temp_dir)]), 1) + def test_download_with_template(self): dirTemplateValues = [ None, @@ -481,11 +504,7 @@ def test_prior_version_manifest(self): # is fully resolved, the manifest below should not be empty, and this test should be updated # with count equal to 5 with open(temp_manifest_file) as file: - assert len(file.readlines()) == 0 - - def test_list_indices(self): - i = IDCClient() - assert i.indices_overview # assert that dict was created + assert len(file.readlines()) == 5 def test_fetch_index(self): i = IDCClient() @@ -494,6 +513,12 @@ def test_fetch_index(self): assert i.indices_overview["sm_index"]["installed"] is True assert hasattr(i, "sm_index") + def test_indices_urls(self): + i = IDCClient() + for index in i.indices_overview: + if i.indices_overview[index]["url"] is not None: + assert remote_file_exists(i.indices_overview[index]["url"]) + if __name__ == "__main__": unittest.main() From 5a9093b18b632b3b732f48d184da4d71be6dba50 Mon Sep 17 00:00:00 2001 From: ds_93 Date: Thu, 19 Sep 2024 17:51:57 +0200 Subject: [PATCH 02/13] Revert "fixed error" This reverts commit eda4f82d25e04319bc2ae70ab58d0038941d6474. 
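
Reverting restores reporting of instance-level download sizes directly in
bytes via _format_size(total_size_bytes, size_in_bytes=True). For reference,
a minimal sketch of the dual-unit size formatter this series converges on;
the real implementation lives in idc_index/index.py, and the final fallback
here is adjusted to always report whole bytes, which is an editorial
assumption rather than the verbatim code:

    def format_size(size, size_in_bytes: bool = False) -> str:
        """Return a human-readable size given MB (default) or bytes."""
        # Normalize the input to megabytes first.
        size_MB = size / 10**6 if size_in_bytes else size
        size_GB = size_MB / 1000
        size_TB = size_GB / 1000
        if size_TB >= 1:
            return f"{round(size_TB, 2)} TB"
        if size_GB >= 1:
            return f"{round(size_GB, 2)} GB"
        if size_MB >= 1:
            return f"{round(size_MB, 2)} MB"
        # Sub-megabyte sizes are reported in whole bytes.
        return f"{round(size_MB * 10**6)} bytes"

    # format_size(1234, size_in_bytes=True) -> '1234 bytes'
    # format_size(2500)                     -> '2.5 GB'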
--- idc_index/index.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index fbbe7694..e70aac38 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -1593,8 +1593,22 @@ def download_from_selection( total_size = round(result_df["series_size_MB"].sum(), 2) else: total_size_bytes = round(result_df["instance_size"].sum(), 2) - total_size = total_size_bytes / (10**6) # in MB - logger.info("Total size of files to download: " + self._format_size(total_size)) + logger.info( + "Total size of files to download: " + + self._format_size(total_size_bytes, size_in_bytes=True) + ) + + # disk_free_space_MB = psutil.disk_usage(downloadDir).free / (1000 * 1000) + # if disk_free_space_MB < total_size: + # logger.error("Not enough free space on disk to download the files.") + # logger.error( + # "Total size of files to download: " + self._format_size(total_size) + # ) + # logger.error( + # "Total free space on disk: " + self._format_size(disk_free_space_MB) + # ) + # return + logger.info( "Total free space on disk: " + str(psutil.disk_usage(downloadDir).free / (1000 * 1000 * 1000)) From 585c8592aa5e6226d8b0fd75168b3f575e5970ef Mon Sep 17 00:00:00 2001 From: ds_93 Date: Fri, 20 Sep 2024 11:16:07 +0200 Subject: [PATCH 03/13] Revert "removed hardcoded idc-index-data version" This reverts commit 111c01c01ac37ed9943ea5ecf762ebbcffb96a74. --- idc_index/index.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/idc_index/index.py b/idc_index/index.py index e70aac38..5848486d 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -22,6 +22,11 @@ aws_endpoint_url = "https://s3.amazonaws.com" gcp_endpoint_url = "https://storage.googleapis.com" asset_endpoint_url = f"https://github.com/ImagingDataCommons/idc-index-data/releases/download/{idc_index_data.__version__}" +# TODO: remove later +asset_endpoint_url = ( + "https://github.com/ImagingDataCommons/idc-index-data/releases/download/18.2.0" +) + logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO) logger = logging.getLogger(__name__) From 8b25255e82c1ac0260f5985f994a3f56a1f21174 Mon Sep 17 00:00:00 2001 From: ds_93 Date: Fri, 20 Sep 2024 11:20:37 +0200 Subject: [PATCH 04/13] Revert "Revert "removed hardcoded idc-index-data version"" This reverts commit 585c8592aa5e6226d8b0fd75168b3f575e5970ef. 
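
With the hardcoded release URL dropped again, the remote asset endpoint is
derived from the installed idc-index-data package. A minimal sketch of the
resulting lookup, assuming idc_index_data.__version__ matches a published
GitHub release tag that hosts the per-index parquet assets:

    import idc_index_data

    asset_endpoint_url = (
        "https://github.com/ImagingDataCommons/idc-index-data/releases/download/"
        f"{idc_index_data.__version__}"
    )
    # e.g. the Slide Microscopy index fetched by IDCClient.fetch_index():
    sm_index_url = f"{asset_endpoint_url}/sm_index.parquet"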
--- idc_index/index.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 5848486d..e70aac38 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -22,11 +22,6 @@ aws_endpoint_url = "https://s3.amazonaws.com" gcp_endpoint_url = "https://storage.googleapis.com" asset_endpoint_url = f"https://github.com/ImagingDataCommons/idc-index-data/releases/download/{idc_index_data.__version__}" -# TODO: remove later -asset_endpoint_url = ( - "https://github.com/ImagingDataCommons/idc-index-data/releases/download/18.2.0" -) - logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO) logger = logging.getLogger(__name__) From 8cf5722dfe3f1c70cc9ff689f28aaa6a3716846b Mon Sep 17 00:00:00 2001 From: ds_93 Date: Fri, 20 Sep 2024 11:26:22 +0200 Subject: [PATCH 05/13] re-done merging of format_size functions --- idc_index/index.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index e70aac38..04a733a6 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -1001,7 +1001,7 @@ def _track_download_progress( logger.info( "Initial size of the directory: %s", - IDCClient._format_size_bytes(initial_size_bytes), + IDCClient._format_size(initial_size_bytes, size_in_bytes=True), ) logger.info( "Approximate size of the files that need to be downloaded: %s", @@ -1333,26 +1333,21 @@ def _s5cmd_run( logger.info("Successfully downloaded files to %s", str(downloadDir)) @staticmethod - def _format_size(size_MB): + def _format_size(size, size_in_bytes: bool = False): + if size_in_bytes: + size_MB = size / (10**6) + else: + size_MB = size size_GB = size_MB / 1000 size_TB = size_GB / 1000 if size_TB >= 1: return f"{round(size_TB, 2)} TB" - if size_GB >= 1: - return f"{round(size_GB, 2)} GB" - return f"{round(size_MB, 2)} MB" - - @staticmethod - def _format_size_bytes(size_bytes): - size_MB = size_bytes / (10**6) - size_GB = size_MB / 1000 - if size_GB >= 1: return f"{round(size_GB, 2)} GB" if size_MB >= 1: return f"{round(size_MB, 2)} MB" - return f"{round(size_bytes, 2)} bytes" + return f"{round(size, 2)} bytes" def download_from_manifest( self, From a1f272dd53e573a824797fcad84a65a2829e5edc Mon Sep 17 00:00:00 2001 From: ds_93 Date: Fri, 20 Sep 2024 11:40:16 +0200 Subject: [PATCH 06/13] fixed variable used before assignment --- idc_index/index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 43dd3f71..50820be4 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -1587,10 +1587,10 @@ def download_from_selection( if not sopInstanceUID: total_size = round(result_df["series_size_MB"].sum(), 2) else: - total_size_bytes = round(result_df["instance_size"].sum(), 2) + total_size = round(result_df["instance_size"].sum(), 2) logger.info( "Total size of files to download: " - + self._format_size(total_size_bytes, size_in_bytes=True) + + self._format_size(total_size, size_in_bytes=True) ) disk_free_space_MB = psutil.disk_usage(downloadDir).free / (1000 * 1000) From d49f375ace1b207aafd677e6c9c2ed90df3800a9 Mon Sep 17 00:00:00 2001 From: ds_93 Date: Fri, 20 Sep 2024 12:07:54 +0200 Subject: [PATCH 07/13] fixed false not enough free space error --- idc_index/index.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 50820be4..4e1075df 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -1587,11 +1587,12 @@ def download_from_selection( if not 
sopInstanceUID: total_size = round(result_df["series_size_MB"].sum(), 2) else: - total_size = round(result_df["instance_size"].sum(), 2) + total_size_bytes = round(result_df["instance_size"].sum(), 2) logger.info( "Total size of files to download: " - + self._format_size(total_size, size_in_bytes=True) + + self._format_size(total_size_bytes, size_in_bytes=True) ) + total_size = total_size_bytes / (10**6) disk_free_space_MB = psutil.disk_usage(downloadDir).free / (1000 * 1000) if disk_free_space_MB < total_size: From bbef810a398a6cb9923997daca37ffb318d82c07 Mon Sep 17 00:00:00 2001 From: ds_93 Date: Mon, 30 Sep 2024 12:00:08 +0200 Subject: [PATCH 08/13] removed merge with main index when adding new index --- idc_index/index.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 4e1075df..5a0b7b35 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -263,13 +263,8 @@ def fetch_index(self, index) -> None: with open(filepath, mode="wb") as file: file.write(response.content) - # Join new index with main index - sm_instance_index = pd.read_parquet(filepath) - sm_instance_index = sm_instance_index.merge( - self.index, on="SeriesInstanceUID", how="left" - ) - - setattr(self.__class__, index, sm_instance_index) + index_table = pd.read_parquet(filepath) + setattr(self.__class__, index, index_table) self.indices_overview[index]["installed"] = True else: From e93bad2933f135e6bb5771b62b2733f1ab7206d4 Mon Sep 17 00:00:00 2001 From: ds_93 Date: Mon, 30 Sep 2024 12:47:54 +0200 Subject: [PATCH 09/13] modified sql query in download_from_selection in case of instance download to be able to catch attributes necessary for hierarchy template --- idc_index/index.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/idc_index/index.py b/idc_index/index.py index 5a0b7b35..8623e785 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -264,6 +264,10 @@ def fetch_index(self, index) -> None: file.write(response.content) index_table = pd.read_parquet(filepath) + # index_table = index_table.merge( + # self.index[["series_aws_url", "SeriesInstanceUID"]], + # on="SeriesInstanceUID", how="left" + # ) setattr(self.__class__, index, index_table) self.indices_overview[index]["installed"] = True @@ -1617,6 +1621,7 @@ def download_from_selection( downloadDir=downloadDir, dirTemplate=dirTemplate, ) + if sopInstanceUID: sql = f""" WITH temp as @@ -1633,6 +1638,8 @@ def download_from_selection( temp JOIN sm_instance_index using (sopInstanceUID) + JOIN + index using (seriesInstanceUID) """ else: sql = f""" From 86bb42d8297a2a6e53c303b0c8fd5bfc34b78a7d Mon Sep 17 00:00:00 2001 From: ds_93 Date: Tue, 1 Oct 2024 11:29:38 +0200 Subject: [PATCH 10/13] trim trailing whitespaces fix --- idc_index/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/idc_index/index.py b/idc_index/index.py index ee987c85..f87ea432 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -1594,7 +1594,7 @@ def download_from_selection( ) else: hierarchy = f"CONCAT('{downloadDir}')" - + if sopInstanceUID: sql = f""" WITH temp as From 2f81497acf1d5385488f3f7c84b048ffa0221c4d Mon Sep 17 00:00:00 2001 From: ds_93 Date: Tue, 1 Oct 2024 11:39:21 +0200 Subject: [PATCH 11/13] added index_crdc_series_uuid --- idc_index/index.py | 1 + 1 file changed, 1 insertion(+) diff --git a/idc_index/index.py b/idc_index/index.py index f87ea432..397f93b3 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -1607,6 +1607,7 @@ def download_from_selection( SELECT 
series_aws_url, CONCAT(TRIM('*' FROM series_aws_url), crdc_instance_uuid, '.dcm') as instance_url, + REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) index_crdc_series_uuid, series_size_MB, {hierarchy} as path FROM From ef0101d0a002be8d30ad4f40b6b7ba6dc03a516b Mon Sep 17 00:00:00 2001 From: Andrey Fedorov Date: Mon, 7 Oct 2024 15:34:41 -0400 Subject: [PATCH 12/13] BUG: clean up after conflict resolution --- idc_index/index.py | 12 +++++------- tests/idcindex.py | 3 +++ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 0d113bc1..c6d0ea5b 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -153,7 +153,7 @@ def _safe_filter_by_selection( patientId, studyInstanceUID, seriesInstanceUID, - sopInstanceUID=None, + sopInstanceUID, crdc_series_uuid, ): if collection_id is not None: @@ -205,10 +205,10 @@ def _safe_filter_by_selection( if sopInstanceUID is not None: result_df = IDCClient._filter_by_dicom_instance_uid( - result_df, sopInstanceUID + df_index, sopInstanceUID ) return result_df - + if seriesInstanceUID is not None: result_df = IDCClient._filter_by_dicom_series_uid( df_index, seriesInstanceUID @@ -1512,6 +1512,7 @@ def citations_from_selection( patientId=patientId, studyInstanceUID=studyInstanceUID, seriesInstanceUID=seriesInstanceUID, + sopInstanceUID=None, crdc_series_uuid=None, ) @@ -1604,10 +1605,7 @@ def download_from_selection( raise ValueError( "Instance-level access not possible because instance-level index not installed." ) - else: - download_df = self.index - - if crdc_series_uuid is not None: + elif crdc_series_uuid is not None: download_df = pd.concat( [ self.index[ diff --git a/tests/idcindex.py b/tests/idcindex.py index 18c8311a..66b875b9 100644 --- a/tests/idcindex.py +++ b/tests/idcindex.py @@ -15,6 +15,9 @@ # Run tests using the following command from the root of the repository: # python -m unittest -vv tests/idcindex.py +# +# run specific tests with this: +# pytest ./tests/idcindex.py::TestIDCClient.test_download_dicom_instance logging.basicConfig(level=logging.DEBUG) From 5af1112dd767419358d2ee70e1222b58cdaede0d Mon Sep 17 00:00:00 2001 From: Andrey Fedorov Date: Tue, 8 Oct 2024 10:27:55 -0400 Subject: [PATCH 13/13] ENH: update to disable sync for instance-level download We do not currently support instance-level manifests as input to the command-line download tool. We also do not provide any mechanism to generate instance-level manifests from the portal. Sync operation is using series level size for estimating progress. Implementing support for syncing instance-level manifests will take work, which would not be justified: for now, I think it is safe to assume instance-level download will only be invoked by passing SOPInstanceUID to the functions/download tool, and with just a few instances/files, syncing those instead of copy won't bring much benefit. --- idc_index/index.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index c6d0ea5b..f6f85385 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -1605,6 +1605,11 @@ def download_from_selection( raise ValueError( "Instance-level access not possible because instance-level index not installed." ) + if use_s5cmd_sync: + logger.warning( + "s5cmd sync is not supported for downloading individual files. Disabling sync." 
+ ) + use_s5cmd_sync = False elif crdc_series_uuid is not None: download_df = pd.concat( [ @@ -1699,9 +1704,8 @@ def download_from_selection( ) SELECT series_aws_url, - CONCAT(TRIM('*' FROM series_aws_url), crdc_instance_uuid, '.dcm') as instance_url, + CONCAT(TRIM('*' FROM series_aws_url), crdc_instance_uuid, '.dcm') as instance_aws_url, REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) index_crdc_series_uuid, - series_size_MB, {hierarchy} as path FROM temp @@ -1735,13 +1739,13 @@ def download_from_selection( with tempfile.NamedTemporaryFile(mode="w", delete=False) as manifest_file: # Determine column containing the URL for instance / series-level access if sopInstanceUID: - if not "instance_url" in result_df: - result_df["instance_url"] = ( + if not "instance_aws_url" in result_df: + result_df["instance_aws_url"] = ( result_df["series_aws_url"].replace("/*", "/") + result_df["crdc_instance_uuid"] + ".dcm" ) - url_column = "instance_url" + url_column = "instance_aws_url" else: url_column = "series_aws_url" @@ -1776,6 +1780,12 @@ def download_from_selection( Temporary download manifest is generated and is passed to self._s5cmd_run """ ) + if sopInstanceUID: + s5cmd_sync_helper_df = None + else: + s5cmd_sync_helper_df = result_df[ + ["index_crdc_series_uuid", "s5cmd_cmd", "series_size_MB", "path"] + ] self._s5cmd_run( endpoint_to_use=aws_endpoint_url, manifest_file=Path(manifest_file.name), @@ -1786,9 +1796,7 @@ def download_from_selection( use_s5cmd_sync=use_s5cmd_sync, dirTemplate=dirTemplate, list_of_directories=list_of_directories, - s5cmd_sync_helper_df=result_df[ - ["index_crdc_series_uuid", "s5cmd_cmd", "series_size_MB", "path"] - ], + s5cmd_sync_helper_df=s5cmd_sync_helper_df, ) def download_dicom_instance(