From 831a54d1756500d0591cb72dc2ae687d43c6a8aa Mon Sep 17 00:00:00 2001 From: Amy Miniter <40307120+amyminiter@users.noreply.github.com> Date: Thu, 22 Jan 2026 15:17:31 +1100 Subject: [PATCH 01/11] Update command --- scripts/generic_https_transfer.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/scripts/generic_https_transfer.py b/scripts/generic_https_transfer.py index 686eef17..539f0e93 100755 --- a/scripts/generic_https_transfer.py +++ b/scripts/generic_https_transfer.py @@ -25,8 +25,9 @@ default=False, help='Use filenames defined before each url', ) +@click.option('--use-wget', is_flag=True, default=False, help='Use wget instead of curl') @click.option('--presigned-url-file-path') -def main(presigned_url_file_path: str, filenames: bool): +def main(presigned_url_file_path: str, filenames: bool, use_wget: bool): """ Given a list of presigned URLs, download the files and upload them to GCS. If each signed url is prefixed by a filename and a space, use the --filenames flag @@ -73,8 +74,15 @@ def main(presigned_url_file_path: str, filenames: bool): authenticate_cloud_credentials_in_job(job=j) # catch errors during the cURL j.command('set -euxo pipefail') - j.command( - f'curl -L {quoted_url} | gsutil cp - {os.path.join(output_path, filename)}', + # if not success, then wait and try again. Curl has ways to resume interrupted uploads: -C - + # Consider wGet as an alternative to curl if issues arise + if use_wget: + j.command( + f'wget -O - {quoted_url} | gsutil cp - {os.path.join(output_path, filename)}', + ) + else: + j.command( + f'curl -L {quoted_url} | gsutil cp - {os.path.join(output_path, filename)}', ) batch.run(wait=False) From 172e5b5109c27574c90ac019467f06d0667e9c77 Mon Sep 17 00:00:00 2001 From: Amy Miniter <40307120+amyminiter@users.noreply.github.com> Date: Thu, 22 Jan 2026 15:24:27 +1100 Subject: [PATCH 02/11] Fix linting --- scripts/generic_https_transfer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/generic_https_transfer.py b/scripts/generic_https_transfer.py index 539f0e93..85879ad1 100755 --- a/scripts/generic_https_transfer.py +++ b/scripts/generic_https_transfer.py @@ -25,7 +25,12 @@ default=False, help='Use filenames defined before each url', ) -@click.option('--use-wget', is_flag=True, default=False, help='Use wget instead of curl') +@click.option( + '--use-wget', + is_flag=True, + default=False, + help='Use wget instead of curl', +) @click.option('--presigned-url-file-path') def main(presigned_url_file_path: str, filenames: bool, use_wget: bool): """ @@ -83,7 +88,7 @@ def main(presigned_url_file_path: str, filenames: bool, use_wget: bool): else: j.command( f'curl -L {quoted_url} | gsutil cp - {os.path.join(output_path, filename)}', - ) + ) batch.run(wait=False) From f1ef478346abe7688997e34337f96b25ed8b3dc5 Mon Sep 17 00:00:00 2001 From: Amy Miniter <40307120+amyminiter@users.noreply.github.com> Date: Thu, 22 Jan 2026 15:25:26 +1100 Subject: [PATCH 03/11] Remove redundant comments --- scripts/generic_https_transfer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/generic_https_transfer.py b/scripts/generic_https_transfer.py index 85879ad1..f8de33f7 100755 --- a/scripts/generic_https_transfer.py +++ b/scripts/generic_https_transfer.py @@ -79,8 +79,7 @@ def main(presigned_url_file_path: str, filenames: bool, use_wget: bool): authenticate_cloud_credentials_in_job(job=j) # catch errors during the cURL j.command('set -euxo pipefail') - # if not success, then wait and try again. Curl has ways to resume interrupted uploads: -C - - # Consider wGet as an alternative to curl if issues arise + if use_wget: j.command( f'wget -O - {quoted_url} | gsutil cp - {os.path.join(output_path, filename)}', From 723472d8b33bc3e2121a4adc79eb5cf94579fa53 Mon Sep 17 00:00:00 2001 From: Amy Miniter <40307120+amyminiter@users.noreply.github.com> Date: Thu, 22 Jan 2026 15:28:14 +1100 Subject: [PATCH 04/11] Fix linting --- scripts/generic_https_transfer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generic_https_transfer.py b/scripts/generic_https_transfer.py index f8de33f7..eb8f8c4b 100755 --- a/scripts/generic_https_transfer.py +++ b/scripts/generic_https_transfer.py @@ -79,7 +79,7 @@ def main(presigned_url_file_path: str, filenames: bool, use_wget: bool): authenticate_cloud_credentials_in_job(job=j) # catch errors during the cURL j.command('set -euxo pipefail') - + if use_wget: j.command( f'wget -O - {quoted_url} | gsutil cp - {os.path.join(output_path, filename)}', From 671e7d430e63754c3334b582fc7db541131be8d6 Mon Sep 17 00:00:00 2001 From: Amy Miniter <40307120+amyminiter@users.noreply.github.com> Date: Fri, 23 Jan 2026 08:27:04 +1100 Subject: [PATCH 05/11] Updated use of quote() for output and source paths --- scripts/generic_https_transfer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/generic_https_transfer.py b/scripts/generic_https_transfer.py index eb8f8c4b..74231c1f 100755 --- a/scripts/generic_https_transfer.py +++ b/scripts/generic_https_transfer.py @@ -75,18 +75,19 @@ def main(presigned_url_file_path: str, filenames: bool, use_wget: bool): if not preemptible_vm: j.spot(is_spot=False) - quoted_url = quote(url) + quoted_source_url = quote(url) + quoted_output_path = quote(output_path) authenticate_cloud_credentials_in_job(job=j) # catch errors during the cURL j.command('set -euxo pipefail') if use_wget: j.command( - f'wget -O - {quoted_url} | gsutil cp - {os.path.join(output_path, filename)}', + f'wget -O - {quoted_source_url} | gsutil cp - {os.path.join(quoted_output_path, filename)}', ) else: j.command( - f'curl -L {quoted_url} | gsutil cp - {os.path.join(output_path, filename)}', + f'curl -L {quoted_source_url} | gsutil cp - {os.path.join(quoted_output_path, filename)}', ) batch.run(wait=False) From 450fecd84ab94be93cd618712f7ad5dec32461f7 Mon Sep 17 00:00:00 2001 From: Amy Miniter <40307120+amyminiter@users.noreply.github.com> Date: Fri, 23 Jan 2026 08:28:35 +1100 Subject: [PATCH 06/11] Understand comment and update quote() properly --- scripts/generic_https_transfer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/generic_https_transfer.py b/scripts/generic_https_transfer.py index 74231c1f..ce89fb4d 100755 --- a/scripts/generic_https_transfer.py +++ b/scripts/generic_https_transfer.py @@ -76,18 +76,17 @@ def main(presigned_url_file_path: str, filenames: bool, use_wget: bool): j.spot(is_spot=False) quoted_source_url = quote(url) - quoted_output_path = quote(output_path) authenticate_cloud_credentials_in_job(job=j) # catch errors during the cURL j.command('set -euxo pipefail') if use_wget: j.command( - f'wget -O - {quoted_source_url} | gsutil cp - {os.path.join(quoted_output_path, filename)}', + f'wget -O - {quoted_source_url} | gsutil cp - {quote(os.path.join(output_path, filename))}', ) else: j.command( - f'curl -L {quoted_source_url} | gsutil cp - {os.path.join(quoted_output_path, filename)}', + f'curl -L {quoted_source_url} | gsutil cp - {quote(os.path.join(output_path, filename))}', ) batch.run(wait=False) From 8321c785290dcbcd337241845c24ad5873363cf3 Mon Sep 17 00:00:00 2001 From: Amy Miniter <40307120+amyminiter@users.noreply.github.com> Date: Fri, 23 Jan 2026 08:31:49 +1100 Subject: [PATCH 07/11] Add consistency for wget curl commands --- scripts/generic_https_transfer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/generic_https_transfer.py b/scripts/generic_https_transfer.py index ce89fb4d..33f25e31 100755 --- a/scripts/generic_https_transfer.py +++ b/scripts/generic_https_transfer.py @@ -76,17 +76,18 @@ def main(presigned_url_file_path: str, filenames: bool, use_wget: bool): j.spot(is_spot=False) quoted_source_url = quote(url) + quoted_output_url = quote(os.path.join(output_path, filename)) authenticate_cloud_credentials_in_job(job=j) # catch errors during the cURL j.command('set -euxo pipefail') if use_wget: j.command( - f'wget -O - {quoted_source_url} | gsutil cp - {quote(os.path.join(output_path, filename))}', + f'wget -O - {quoted_source_url} | gsutil cp - {quoted_output_url}', ) else: j.command( - f'curl -L {quoted_source_url} | gsutil cp - {quote(os.path.join(output_path, filename))}', + f'curl -L {quoted_source_url} | gsutil cp - {quoted_output_url}', ) batch.run(wait=False) From 397b2eae5cf9cea8106e35884616b536d7608762 Mon Sep 17 00:00:00 2001 From: Amy Miniter <40307120+amyminiter@users.noreply.github.com> Date: Fri, 23 Jan 2026 08:46:55 +1100 Subject: [PATCH 08/11] Implement reviewer comments --- scripts/generic_https_transfer.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/generic_https_transfer.py b/scripts/generic_https_transfer.py index 33f25e31..1949516d 100755 --- a/scripts/generic_https_transfer.py +++ b/scripts/generic_https_transfer.py @@ -26,19 +26,18 @@ help='Use filenames defined before each url', ) @click.option( - '--use-wget', - is_flag=True, - default=False, - help='Use wget instead of curl', + '--mode', + type=click.Choice(['curl', 'wget'], case_sensitive=False), + default='curl', + help='The download tool for the file. Default is curl.git', ) @click.option('--presigned-url-file-path') -def main(presigned_url_file_path: str, filenames: bool, use_wget: bool): +def main(presigned_url_file_path: str, filenames: bool, mode: str): """ Given a list of presigned URLs, download the files and upload them to GCS. If each signed url is prefixed by a filename and a space, use the --filenames flag GCP suffix in target GCP bucket is defined using analysis-runner's --output """ - cpg_driver_image = config_retrieve(['workflow', 'driver_image']) dataset = config_retrieve(['workflow', 'dataset']) output_prefix = config_retrieve(['workflow', 'output_prefix']) @@ -81,7 +80,7 @@ def main(presigned_url_file_path: str, filenames: bool, use_wget: bool): # catch errors during the cURL j.command('set -euxo pipefail') - if use_wget: + if mode == 'wget': j.command( f'wget -O - {quoted_source_url} | gsutil cp - {quoted_output_url}', ) From 63f5642456ffe21c37559f91d9bd2b3fcbf5c709 Mon Sep 17 00:00:00 2001 From: Amy Miniter <40307120+amyminiter@users.noreply.github.com> Date: Fri, 23 Jan 2026 09:18:44 +1100 Subject: [PATCH 09/11] Replace if else block with match statement --- scripts/generic_https_transfer.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/scripts/generic_https_transfer.py b/scripts/generic_https_transfer.py index 1949516d..2644aec4 100755 --- a/scripts/generic_https_transfer.py +++ b/scripts/generic_https_transfer.py @@ -80,14 +80,17 @@ def main(presigned_url_file_path: str, filenames: bool, mode: str): # catch errors during the cURL j.command('set -euxo pipefail') - if mode == 'wget': - j.command( - f'wget -O - {quoted_source_url} | gsutil cp - {quoted_output_url}', - ) - else: - j.command( - f'curl -L {quoted_source_url} | gsutil cp - {quoted_output_url}', - ) + match mode: + case 'wget': + j.command( + f'wget -O - {quoted_source_url} | gsutil cp - {quoted_output_url}', + ) + case 'curl': + j.command( + f'curl -L {quoted_source_url} | gsutil cp - {quoted_output_url}', + ) + case _: + raise ValueError(f'invalid mode: {mode}') batch.run(wait=False) From c5016da254bc9439bbaea0bf5ea9807619a5a722 Mon Sep 17 00:00:00 2001 From: Amy Miniter <40307120+amyminiter@users.noreply.github.com> Date: Fri, 23 Jan 2026 09:19:37 +1100 Subject: [PATCH 10/11] Fix typo in comment --- scripts/generic_https_transfer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generic_https_transfer.py b/scripts/generic_https_transfer.py index 2644aec4..f5b8165a 100755 --- a/scripts/generic_https_transfer.py +++ b/scripts/generic_https_transfer.py @@ -29,7 +29,7 @@ '--mode', type=click.Choice(['curl', 'wget'], case_sensitive=False), default='curl', - help='The download tool for the file. Default is curl.git', + help='The download tool for the file. Default is curl', ) @click.option('--presigned-url-file-path') def main(presigned_url_file_path: str, filenames: bool, mode: str): From 5d7aa299af45cb8dde6d6b40d4e9686a77e60935 Mon Sep 17 00:00:00 2001 From: Amy Miniter <40307120+amyminiter@users.noreply.github.com> Date: Fri, 23 Jan 2026 11:51:47 +1100 Subject: [PATCH 11/11] Replace gsutil with gcloud storage --- scripts/generic_https_transfer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/generic_https_transfer.py b/scripts/generic_https_transfer.py index f5b8165a..970949d5 100755 --- a/scripts/generic_https_transfer.py +++ b/scripts/generic_https_transfer.py @@ -83,11 +83,11 @@ def main(presigned_url_file_path: str, filenames: bool, mode: str): match mode: case 'wget': j.command( - f'wget -O - {quoted_source_url} | gsutil cp - {quoted_output_url}', + f'wget -O - {quoted_source_url} | gcloud storage cp - {quoted_output_url}', ) case 'curl': j.command( - f'curl -L {quoted_source_url} | gsutil cp - {quoted_output_url}', + f'curl -L {quoted_source_url} | gcloud storage cp - {quoted_output_url}', ) case _: raise ValueError(f'invalid mode: {mode}')