Skip to content

Commit 7b7918e

Browse files
author
Googler
committed
feat(components): Support dynamic values for boot_disk_type, boot_disk_size in preview.custom_job.utils.create_custom_training_job_from_component
Signed-off-by: Googler <nobody@google.com> PiperOrigin-RevId: 662242688
1 parent 289f64f commit 7b7918e

File tree

3 files changed

+49
-29
lines changed

3 files changed

+49
-29
lines changed

components/google-cloud/RELEASE.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
## Upcoming release
22
* Fix to model batch explanation component for Structured Data pipelines; image bump.
3+
* Add dynamic support for boot_disk_type, boot_disk_size in `preview.custom_job.utils.create_custom_training_job_from_component`.
34

45
## Release 2.16.0
56
* Updated the Starry Net pipeline's template gallery description, and added dataprep_nan_threshold and dataprep_zero_threshold args to the Starry Net pipeline.

components/google-cloud/google_cloud_pipeline_components/container/preview/custom_job/remote_runner.py

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -32,23 +32,32 @@ def insert_system_labels_into_payload(payload):
3232
return json.dumps(job_spec)
3333

3434

35-
def cast_accelerator_count_to_int(payload):
36-
"""Casts accelerator_count from string to an int."""
35+
def is_json(test_string: str) -> bool:
36+
try:
37+
json.loads(test_string)
38+
except ValueError:
39+
return False
40+
return True
41+
42+
43+
def parse_nested_json_strings(payload):
44+
"""Parse nested json strings in the payload."""
3745

3846
job_spec = json.loads(payload)
39-
# TODO(b/353577594): accelerator_count placeholder is not resolved to int.
40-
# Need to typecast to int to avoid type mismatch error. Can remove when fix
41-
# placeholder resolution.
42-
if (
43-
'accelerator_count'
44-
in job_spec['job_spec']['worker_pool_specs'][0]['machine_spec']
47+
# TODO(b/353577594): Nested placeholder fields inside worker_pool_specs are
48+
# not parsed correctly in the backend. Can remove when the backend logic is fixed.
49+
worker_pool_spec = job_spec['job_spec']['worker_pool_specs'][0]
50+
if is_json(
51+
worker_pool_spec.get('machine_spec', {}).get('accelerator_count', '')
52+
):
53+
worker_pool_spec['machine_spec']['accelerator_count'] = json.loads(
54+
worker_pool_spec['machine_spec']['accelerator_count']
55+
)
56+
if is_json(
57+
worker_pool_spec.get('disk_spec', {}).get('boot_disk_size_gb', '')
4558
):
46-
job_spec['job_spec']['worker_pool_specs'][0]['machine_spec'][
47-
'accelerator_count'
48-
] = int(
49-
job_spec['job_spec']['worker_pool_specs'][0]['machine_spec'][
50-
'accelerator_count'
51-
]
59+
worker_pool_spec['disk_spec']['boot_disk_size_gb'] = json.loads(
60+
worker_pool_spec['disk_spec']['boot_disk_size_gb']
5261
)
5362
return json.dumps(job_spec)
5463

@@ -107,7 +116,7 @@ def create_custom_job(
107116
# Create custom job if it does not exist
108117
job_name = remote_runner.check_if_job_exists()
109118
if job_name is None:
110-
payload = cast_accelerator_count_to_int(payload)
119+
payload = parse_nested_json_strings(payload)
111120
job_name = remote_runner.create_job(
112121
create_custom_job_with_client,
113122
insert_system_labels_into_payload(payload),

components/google-cloud/google_cloud_pipeline_components/preview/custom_job/utils.py

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,8 @@ def create_custom_training_job_from_component(
8484
machine_type: The type of the machine to run the CustomJob. The default value is "n1-standard-4". See [more information](https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types).
8585
accelerator_type: The type of accelerator(s) that may be attached to the machine per `accelerator_count`. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec#acceleratortype).
8686
accelerator_count: The number of accelerators to attach to the machine. Defaults to 1 if `accelerator_type` is set statically.
87-
boot_disk_type: Type of the boot disk (default is "pd-ssd"). Valid values: "pd-ssd" (Persistent Disk Solid State Drive) or "pd-standard" (Persistent Disk Hard Disk Drive). boot_disk_type is set as a static value and cannot be changed as a pipeline parameter.
88-
boot_disk_size_gb: Size in GB of the boot disk (default is 100GB). `boot_disk_size_gb` is set as a static value and cannot be changed as a pipeline parameter.
87+
boot_disk_type: Type of the boot disk (default is "pd-ssd"). Valid values: "pd-ssd" (Persistent Disk Solid State Drive) or "pd-standard" (Persistent Disk Hard Disk Drive).
88+
boot_disk_size_gb: Size in GB of the boot disk (default is 100GB).
8989
timeout: The maximum job running time. The default is 7 days. A duration in seconds with up to nine fractional digits, terminated by 's', for example: "3.5s".
9090
restart_job_on_worker_restart: Restarts the entire CustomJob if a worker gets restarted. This feature can be used by distributed training jobs that are not resilient to workers leaving and joining a job.
9191
service_account: Sets the default service account for workload run-as account. The [service account](https://cloud.google.com/vertex-ai/docs/pipelines/configure-project#service-account) running the pipeline submitting jobs must have act-as permission on this run-as account. If unspecified, the Vertex AI Custom Code [Service Agent](https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents) for the CustomJob's project.
@@ -94,11 +94,11 @@ def create_custom_training_job_from_component(
9494
tensorboard: The name of a Vertex AI TensorBoard resource to which this CustomJob will upload TensorBoard logs.
9595
enable_web_access: Whether you want Vertex AI to enable [interactive shell access](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell) to training containers. If `True`, you can access interactive shells at the URIs given by [CustomJob.web_access_uris][].
9696
reserved_ip_ranges: A list of names for the reserved IP ranges under the VPC network that can be used for this job. If set, we will deploy the job within the provided IP ranges. Otherwise, the job will be deployed to any IP ranges under the provided VPC network.
97-
nfs_mounts: A list of [NfsMount](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#NfsMount) resource specs in Json dict format. For more details about mounting NFS for CustomJob, see [Mount an NFS share for custom training](https://cloud.google.com/vertex-ai/docs/training/train-nfs-share).
97+
nfs_mounts: A list of [NfsMount](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#NfsMount) resource specs in Json dict format. For more details about mounting NFS for CustomJob, see [Mount an NFS share for custom training](https://cloud.google.com/vertex-ai/docs/training/train-nfs-share). `nfs_mounts` is set as a static value and cannot be changed as a pipeline parameter.
9898
base_output_directory: The Cloud Storage location to store the output of this CustomJob or HyperparameterTuningJob. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/GcsDestination).
9999
labels: The labels with user-defined metadata to organize the CustomJob. See [more information](https://goo.gl/xmQnxf).
100100
persistent_resource_id: The ID of the PersistentResource in the same Project and Location which to run. The default value is a placeholder that will be resolved to the PipelineJob [RuntimeConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.pipelineJobs#PipelineJob.RuntimeConfig)'s persistent resource id at runtime. However, if the PipelineJob doesn't set Persistent Resource as the job level runtime, the placeholder will be resolved to an empty string and the custom job will be run on demand. If the value is set explicitly, the custom job will run in the specified persistent resource; in this case, please note the network and CMEK configs on the job should be consistent with those on the PersistentResource, otherwise, the job will be rejected. (This is a Preview feature not yet recommended for production workloads.)
101-
env: Environment variables to be passed to the container. Takes the form `[{'name': '...', 'value': '...'}]`. Maximum limit is 100.
101+
env: Environment variables to be passed to the container. Takes the form `[{'name': '...', 'value': '...'}]`. Maximum limit is 100. `env` is set as a static value and cannot be changed as a pipeline parameter.
102102
103103
Returns:
104104
A KFP component with CustomJob specification applied.
@@ -164,12 +164,11 @@ def create_custom_training_job_from_component(
164164
),
165165
'env': env or [],
166166
},
167+
'disk_spec': {
168+
'boot_disk_type': "{{$.inputs.parameters['boot_disk_type']}}",
169+
'boot_disk_size_gb': "{{$.inputs.parameters['boot_disk_size_gb']}}",
170+
},
167171
}
168-
if boot_disk_type:
169-
worker_pool_spec['disk_spec'] = {
170-
'boot_disk_type': boot_disk_type,
171-
'boot_disk_size_gb': boot_disk_size_gb,
172-
}
173172
if nfs_mounts:
174173
worker_pool_spec['nfs_mounts'] = nfs_mounts
175174

@@ -211,10 +210,7 @@ def create_custom_training_job_from_component(
211210
'defaultValue'
212211
] = default_value
213212

214-
# add machine parameters into the customjob component
215-
if accelerator_type == 'ACCELERATOR_TYPE_UNSPECIFIED':
216-
accelerator_count = 0
217-
213+
# add workerPoolSpec parameters into the customjob component
218214
cj_component_spec['inputDefinitions']['parameters']['machine_type'] = {
219215
'parameterType': 'STRING',
220216
'defaultValue': machine_type,
@@ -227,7 +223,21 @@ def create_custom_training_job_from_component(
227223
}
228224
cj_component_spec['inputDefinitions']['parameters']['accelerator_count'] = {
229225
'parameterType': 'NUMBER_INTEGER',
230-
'defaultValue': accelerator_count,
226+
'defaultValue': (
227+
accelerator_count
228+
if accelerator_type != 'ACCELERATOR_TYPE_UNSPECIFIED'
229+
else 0
230+
),
231+
'isOptional': True,
232+
}
233+
cj_component_spec['inputDefinitions']['parameters']['boot_disk_type'] = {
234+
'parameterType': 'STRING',
235+
'defaultValue': boot_disk_type,
236+
'isOptional': True,
237+
}
238+
cj_component_spec['inputDefinitions']['parameters']['boot_disk_size_gb'] = {
239+
'parameterType': 'NUMBER_INTEGER',
240+
'defaultValue': boot_disk_size_gb,
231241
'isOptional': True,
232242
}
233243

0 commit comments

Comments
 (0)