From e616a2c8aff9dac2e01aa1f9b81730e55d936736 Mon Sep 17 00:00:00 2001 From: Amy Miniter <40307120+amyminiter@users.noreply.github.com> Date: Wed, 19 Nov 2025 11:55:11 +1100 Subject: [PATCH 1/5] Configure for non-preemptible vms --- scripts/generic_https_transfer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/generic_https_transfer.py b/scripts/generic_https_transfer.py index 54c7dd71..eb977fb7 100755 --- a/scripts/generic_https_transfer.py +++ b/scripts/generic_https_transfer.py @@ -26,7 +26,8 @@ help='Use filenames defined before each url', ) @click.option('--presigned-url-file-path') -def main(presigned_url_file_path: str, filenames: bool): +@click.option('--non-preemptible-vm', is_flag=True, default=False, help='Use preemptible VMs') +def main(presigned_url_file_path: str, filenames: bool, non_preemptible_vm: bool): """ Given a list of presigned URLs, download the files and upload them to GCS. If each signed url is prefixed by a filename and a space, use the --filenames flag @@ -65,6 +66,7 @@ def main(presigned_url_file_path: str, filenames: bool): for idx, url in enumerate(presigned_urls): filename = names[idx] if names else os.path.basename(url).split('?')[0] j = batch.new_job(f'URL {idx} ({filename})') + j.spot(is_spot=non_preemptible_vm) quoted_url = quote(url) authenticate_cloud_credentials_in_job(job=j) # catch errors during the cURL From 53913dae69651d71924a21cad7bd42b841815294 Mon Sep 17 00:00:00 2001 From: Amy Miniter <40307120+amyminiter@users.noreply.github.com> Date: Wed, 19 Nov 2025 16:17:44 +1100 Subject: [PATCH 2/5] Use env_config to determine preemptible machine --- scripts/generic_https_transfer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/generic_https_transfer.py b/scripts/generic_https_transfer.py index eb977fb7..7073dac4 100755 --- a/scripts/generic_https_transfer.py +++ b/scripts/generic_https_transfer.py @@ -26,8 +26,7 @@ help='Use filenames defined before each url', ) @click.option('--presigned-url-file-path') -@click.option('--non-preemptible-vm', is_flag=True, default=False, help='Use preemptible VMs') -def main(presigned_url_file_path: str, filenames: bool, non_preemptible_vm: bool): +def main(presigned_url_file_path: str, filenames: bool,): """ Given a list of presigned URLs, download the files and upload them to GCS. If each signed url is prefixed by a filename and a space, use the --filenames flag @@ -39,7 +38,10 @@ def main(presigned_url_file_path: str, filenames: bool, non_preemptible_vm: bool billing_project = env_config['hail']['billing_project'] dataset = env_config['workflow']['dataset'] output_prefix = env_config['workflow']['output_prefix'] + preemptible_vm = env_config['workflow'].get('preemptible_vm', False) + assert all({billing_project, cpg_driver_image, dataset, output_prefix}) + names = None with AnyPath(presigned_url_file_path).open() as file: if filenames: @@ -66,7 +68,7 @@ def main(presigned_url_file_path: str, filenames: bool, non_preemptible_vm: bool for idx, url in enumerate(presigned_urls): filename = names[idx] if names else os.path.basename(url).split('?')[0] j = batch.new_job(f'URL {idx} ({filename})') - j.spot(is_spot=non_preemptible_vm) + j.spot(is_spot=preemptible_vm) quoted_url = quote(url) authenticate_cloud_credentials_in_job(job=j) # catch errors during the cURL From c01b962cb032df5b94af4d2635ebb4925d32d086 Mon Sep 17 00:00:00 2001 From: Amy Miniter <40307120+amyminiter@users.noreply.github.com> Date: Wed, 19 Nov 2025 16:19:38 +1100 Subject: [PATCH 3/5] Use env_config to determine preemptible machine --- scripts/generic_https_transfer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/generic_https_transfer.py b/scripts/generic_https_transfer.py index 7073dac4..8a22f018 100755 --- a/scripts/generic_https_transfer.py +++ b/scripts/generic_https_transfer.py @@ -26,7 +26,10 @@ help='Use filenames defined before each url', ) @click.option('--presigned-url-file-path') -def main(presigned_url_file_path: str, filenames: bool,): +def main( + presigned_url_file_path: str, + filenames: bool +): """ Given a list of presigned URLs, download the files and upload them to GCS. If each signed url is prefixed by a filename and a space, use the --filenames flag From 8a754fef0cae272d2547cbcab088a644f320a77b Mon Sep 17 00:00:00 2001 From: Amy Miniter <40307120+amyminiter@users.noreply.github.com> Date: Thu, 20 Nov 2025 13:47:52 +1100 Subject: [PATCH 4/5] Refactor get_config to config_retrieve, remove assert statement --- scripts/generic_https_transfer.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/scripts/generic_https_transfer.py b/scripts/generic_https_transfer.py index 8a22f018..241b8641 100755 --- a/scripts/generic_https_transfer.py +++ b/scripts/generic_https_transfer.py @@ -10,7 +10,7 @@ import click from cloudpathlib import AnyPath -from cpg_utils.config import get_config +from cpg_utils.config import config_retrieve from cpg_utils.hail_batch import ( authenticate_cloud_credentials_in_job, dataset_path, @@ -36,14 +36,10 @@ def main( GCP suffix in target GCP bucket is defined using analysis-runner's --output """ - env_config = get_config() - cpg_driver_image = env_config['workflow']['driver_image'] - billing_project = env_config['hail']['billing_project'] - dataset = env_config['workflow']['dataset'] - output_prefix = env_config['workflow']['output_prefix'] - preemptible_vm = env_config['workflow'].get('preemptible_vm', False) - - assert all({billing_project, cpg_driver_image, dataset, output_prefix}) + cpg_driver_image = config_retrieve(['workflow','driver_image']) + dataset = config_retrieve(['workflow','dataset']) + output_prefix = config_retrieve(['workflow','output_prefix']) + preemptible_vm = config_retrieve(['workflow','preemptible_vm'], False) names = None with AnyPath(presigned_url_file_path).open() as file: From 07e6ab5924d149fe4e94d0d2969eb7392963bcee Mon Sep 17 00:00:00 2001 From: Amy Miniter <40307120+amyminiter@users.noreply.github.com> Date: Thu, 20 Nov 2025 14:26:27 +1100 Subject: [PATCH 5/5] Refactor: make sure logic makes sense --- scripts/generic_https_transfer.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/scripts/generic_https_transfer.py b/scripts/generic_https_transfer.py index 241b8641..686eef17 100755 --- a/scripts/generic_https_transfer.py +++ b/scripts/generic_https_transfer.py @@ -26,20 +26,17 @@ help='Use filenames defined before each url', ) @click.option('--presigned-url-file-path') -def main( - presigned_url_file_path: str, - filenames: bool -): +def main(presigned_url_file_path: str, filenames: bool): """ Given a list of presigned URLs, download the files and upload them to GCS. If each signed url is prefixed by a filename and a space, use the --filenames flag GCP suffix in target GCP bucket is defined using analysis-runner's --output """ - cpg_driver_image = config_retrieve(['workflow','driver_image']) - dataset = config_retrieve(['workflow','dataset']) - output_prefix = config_retrieve(['workflow','output_prefix']) - preemptible_vm = config_retrieve(['workflow','preemptible_vm'], False) + cpg_driver_image = config_retrieve(['workflow', 'driver_image']) + dataset = config_retrieve(['workflow', 'dataset']) + output_prefix = config_retrieve(['workflow', 'output_prefix']) + preemptible_vm = config_retrieve(['workflow', 'preemptible_vm'], True) names = None with AnyPath(presigned_url_file_path).open() as file: @@ -67,7 +64,11 @@ def main( for idx, url in enumerate(presigned_urls): filename = names[idx] if names else os.path.basename(url).split('?')[0] j = batch.new_job(f'URL {idx} ({filename})') - j.spot(is_spot=preemptible_vm) + + # new_job sets is_spot automatically to True + if not preemptible_vm: + j.spot(is_spot=False) + quoted_url = quote(url) authenticate_cloud_credentials_in_job(job=j) # catch errors during the cURL