Skip to content

Commit 90efa80

Browse files
authored
Dynamically allocate volume size (#3489)
Take into account the input and auxiliary data size when assigning the volume size. Also set the memory limit explicitly and update the SageMaker Shim. See DIAGNijmegen/rse-grand-challenge-admin#309 and DIAGNijmegen/rse-grand-challenge-admin#306.
1 parent d4a298a commit 90efa80

File tree

7 files changed

+87
-10
lines changed

7 files changed

+87
-10
lines changed

app/config/settings.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1060,12 +1060,6 @@ def sentry_before_send(event, hint):
10601060
COMPONENTS_AMAZON_SAGEMAKER_SUBNETS = os.environ.get(
10611061
"COMPONENTS_AMAZON_SAGEMAKER_SUBNETS", ""
10621062
).split(",")
1063-
# This was 30 to match SageMaker Batch Inference but more is
1064-
# required for ground truths
1065-
# TODO Make this dynamic https://github.com/DIAGNijmegen/rse-grand-challenge-admin/issues/309
1066-
COMPONENTS_AMAZON_SAGEMAKER_VOLUME_SIZE_GB = int(
1067-
os.environ.get("COMPONENTS_AMAZON_SAGEMAKER_VOLUME_SIZE_GB", "50")
1068-
)
10691063
COMPONENTS_S3_ENDPOINT_URL = os.environ.get(
10701064
"COMPONENTS_S3_ENDPOINT_URL", AWS_S3_ENDPOINT_URL
10711065
)

app/grandchallenge/components/backends/amazon_sagemaker_base.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ class InstanceType(NamedTuple):
4949
usd_cents_per_hour: int
5050
gpus: int = 0
5151
gpu_type: GPUTypeChoices | None = None
52+
nvme_volume_size: int | None = None
5253

5354

5455
INSTANCE_OPTIONS = [
@@ -211,6 +212,7 @@ class InstanceType(NamedTuple):
211212
usd_cents_per_hour=4071,
212213
gpus=8,
213214
gpu_type=GPUTypeChoices.A100,
215+
nvme_volume_size=8 * 1000,
214216
),
215217
InstanceType(
216218
name="ml.p3.2xlarge",
@@ -275,6 +277,7 @@ class InstanceType(NamedTuple):
275277
usd_cents_per_hour=157,
276278
gpus=1,
277279
gpu_type=GPUTypeChoices.A10G,
280+
nvme_volume_size=250,
278281
),
279282
InstanceType(
280283
name="ml.g5.2xlarge",
@@ -283,6 +286,7 @@ class InstanceType(NamedTuple):
283286
usd_cents_per_hour=169,
284287
gpus=1,
285288
gpu_type=GPUTypeChoices.A10G,
289+
nvme_volume_size=450,
286290
),
287291
InstanceType(
288292
name="ml.g5.4xlarge",
@@ -291,6 +295,7 @@ class InstanceType(NamedTuple):
291295
usd_cents_per_hour=227,
292296
gpus=1,
293297
gpu_type=GPUTypeChoices.A10G,
298+
nvme_volume_size=600,
294299
),
295300
InstanceType(
296301
name="ml.g5.8xlarge",
@@ -299,6 +304,7 @@ class InstanceType(NamedTuple):
299304
usd_cents_per_hour=342,
300305
gpus=1,
301306
gpu_type=GPUTypeChoices.A10G,
307+
nvme_volume_size=900,
302308
),
303309
InstanceType(
304310
name="ml.g5.12xlarge",
@@ -307,6 +313,7 @@ class InstanceType(NamedTuple):
307313
usd_cents_per_hour=791,
308314
gpus=4,
309315
gpu_type=GPUTypeChoices.A10G,
316+
nvme_volume_size=3800,
310317
),
311318
InstanceType(
312319
name="ml.g5.16xlarge",
@@ -315,6 +322,7 @@ class InstanceType(NamedTuple):
315322
usd_cents_per_hour=572,
316323
gpus=1,
317324
gpu_type=GPUTypeChoices.A10G,
325+
nvme_volume_size=1900,
318326
),
319327
InstanceType(
320328
name="ml.g5.24xlarge",
@@ -323,6 +331,7 @@ class InstanceType(NamedTuple):
323331
usd_cents_per_hour=1136,
324332
gpus=4,
325333
gpu_type=GPUTypeChoices.A10G,
334+
nvme_volume_size=3800,
326335
),
327336
InstanceType(
328337
name="ml.g5.48xlarge",
@@ -331,6 +340,7 @@ class InstanceType(NamedTuple):
331340
usd_cents_per_hour=2273,
332341
gpus=8,
333342
gpu_type=GPUTypeChoices.A10G,
343+
nvme_volume_size=2 * 3800,
334344
),
335345
InstanceType(
336346
name="ml.g4dn.xlarge",
@@ -339,6 +349,7 @@ class InstanceType(NamedTuple):
339349
usd_cents_per_hour=83,
340350
gpus=1,
341351
gpu_type=GPUTypeChoices.T4,
352+
nvme_volume_size=125,
342353
),
343354
InstanceType(
344355
name="ml.g4dn.2xlarge",
@@ -347,6 +358,7 @@ class InstanceType(NamedTuple):
347358
usd_cents_per_hour=105,
348359
gpus=1,
349360
gpu_type=GPUTypeChoices.T4,
361+
nvme_volume_size=225,
350362
),
351363
InstanceType(
352364
name="ml.g4dn.4xlarge",
@@ -355,6 +367,7 @@ class InstanceType(NamedTuple):
355367
usd_cents_per_hour=168,
356368
gpus=1,
357369
gpu_type=GPUTypeChoices.T4,
370+
nvme_volume_size=225,
358371
),
359372
InstanceType(
360373
name="ml.g4dn.8xlarge",
@@ -363,6 +376,7 @@ class InstanceType(NamedTuple):
363376
usd_cents_per_hour=304,
364377
gpus=1,
365378
gpu_type=GPUTypeChoices.T4,
379+
nvme_volume_size=900,
366380
),
367381
InstanceType(
368382
name="ml.g4dn.12xlarge",
@@ -371,6 +385,7 @@ class InstanceType(NamedTuple):
371385
usd_cents_per_hour=546,
372386
gpus=4,
373387
gpu_type=GPUTypeChoices.T4,
388+
nvme_volume_size=900,
374389
),
375390
InstanceType(
376391
name="ml.g4dn.16xlarge",
@@ -379,6 +394,7 @@ class InstanceType(NamedTuple):
379394
usd_cents_per_hour=607,
380395
gpus=1,
381396
gpu_type=GPUTypeChoices.T4,
397+
nvme_volume_size=900,
382398
),
383399
]
384400

@@ -548,6 +564,28 @@ def _instance_type(self):
548564
def usd_cents_per_hour(self):
549565
return self._instance_type.usd_cents_per_hour
550566

567+
@property
def _max_memory_mb(self):
    """Memory ceiling for the job, in MB, derived from the instance type.

    One unit of the instance type's ``memory`` is held back for the system
    before scaling by 1024.

    NOTE(review): ``memory`` is presumably expressed in GB -- confirm
    against the ``InstanceType`` definition.
    """
    # Reserve 1 GB for the system
    usable_memory = self._instance_type.memory - 1
    return usable_memory * 1024
571+
572+
@property
def _required_volume_size_gb(self):
    """Volume size in GB to request, capped at the instance's NVMe size.

    Starts from the parent class's estimate. When the instance type
    declares an ``nvme_volume_size`` and the estimate exceeds it, the
    shortfall is logged as an error and the NVMe size is returned instead,
    so the job is still attempted.
    """
    required_gb = super()._required_volume_size_gb
    available_gb = self._instance_type.nvme_volume_size

    # No NVMe size declared, or the estimate fits: use the estimate as is
    if not available_gb or required_gb <= available_gb:
        return required_gb

    logger.error(
        f"Job {self._job_id} likely needs {required_gb} GB but "
        f"instance only has {available_gb} GB. "
        "Attempting to run the job anyway."
    )
    return available_gb
588+
551589
def execute(self, *, input_civs, input_prefixes):
552590
self._create_invocation_json(
553591
input_civs=input_civs, input_prefixes=input_prefixes

app/grandchallenge/components/backends/amazon_sagemaker_training.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def _create_job_boto(self):
6060
},
6161
ResourceConfig={
6262
# https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_ResourceConfig.html
63-
"VolumeSizeInGB": settings.COMPONENTS_AMAZON_SAGEMAKER_VOLUME_SIZE_GB,
63+
"VolumeSizeInGB": self._required_volume_size_gb,
6464
"InstanceType": self._instance_type.name,
6565
"InstanceCount": 1,
6666
},

app/grandchallenge/components/backends/base.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from django.core.exceptions import SuspiciousFileOperation, ValidationError
1717
from django.db import transaction
1818
from django.utils._os import safe_join
19+
from django.utils.functional import cached_property
1920
from panimg.image_builders import image_builder_mhd, image_builder_tiff
2021

2122
from grandchallenge.cases.tasks import import_images
@@ -148,6 +149,9 @@ def invocation_environment(self):
148149
"no_proxy": "amazonaws.com",
149150
"GRAND_CHALLENGE_COMPONENT_WRITABLE_DIRECTORIES": "/opt/ml/output/data:/opt/ml/model:/opt/ml/input/data/ground_truth/:opt/ml/checkpoints:/tmp",
150151
"GRAND_CHALLENGE_COMPONENT_POST_CLEAN_DIRECTORIES": "/opt/ml/output/data:/opt/ml/model:/opt/ml/input/data/ground_truth/",
152+
"GRAND_CHALLENGE_COMPONENT_MAX_MEMORY_MB": str(
153+
self._max_memory_mb
154+
),
151155
}
152156
if self._algorithm_model:
153157
env["GRAND_CHALLENGE_COMPONENT_MODEL"] = (
@@ -159,6 +163,10 @@ def invocation_environment(self):
159163
)
160164
return env
161165

166+
@property
def _max_memory_mb(self):
    """Memory ceiling for the job in MB: ``_memory_limit`` scaled by 1024.

    NOTE(review): ``_memory_limit`` is presumably expressed in GB --
    confirm where it is defined.
    """
    return self._memory_limit * 1024
169+
162170
@property
163171
def compute_cost_euro_millicents(self):
164172
duration = self.duration
@@ -207,6 +215,41 @@ def _algorithm_model_key(self):
207215
def _ground_truth_key(self):
208216
return safe_join(self._auxiliary_data_prefix, "ground-truth.tar.gz")
209217

218+
@property
def _required_volume_size_gb(self):
    """Estimate, in whole GB, of the volume size the job needs.

    Twice the total input size (``_input_size_bytes``), rounded up to a
    whole number of ``settings.GIGABYTE`` units, with a floor of 30.
    """
    # Factor 2 for decompression and making copies
    estimated_gb = ceil(2 * self._input_size_bytes / settings.GIGABYTE)
    # Or match what was provided with Batch Inference
    minimum_gb = 30
    return max(estimated_gb, minimum_gb)
226+
227+
@cached_property
def _input_size_bytes(self):
    """Combined size in bytes of the job's inputs and auxiliary data.

    Adds the sizes reported by ``_get_input_prefix_size_bytes`` for the
    ``_io_prefix`` and ``_auxiliary_data_prefix`` prefixes. The result is
    cached on the instance, so the lookups run at most once per instance.
    """
    io_bytes = self._get_input_prefix_size_bytes(prefix=self._io_prefix)
    auxiliary_bytes = self._get_input_prefix_size_bytes(
        prefix=self._auxiliary_data_prefix
    )
    return io_bytes + auxiliary_bytes
237+
238+
def _get_input_prefix_size_bytes(self, *, prefix):
    """Return the total size in bytes of all objects under ``prefix``.

    Pages through ``list_objects_v2`` on the bucket named by
    ``settings.COMPONENTS_INPUT_BUCKET_NAME`` and sums each listed
    object's ``Size``. Pages without a ``Contents`` key contribute
    nothing, so a prefix that matches no objects yields 0.
    """
    paginator = self._s3_client.get_paginator("list_objects_v2")
    pages = paginator.paginate(
        Bucket=settings.COMPONENTS_INPUT_BUCKET_NAME, Prefix=prefix
    )

    return sum(
        obj["Size"] for page in pages for obj in page.get("Contents", [])
    )
252+
210253
def _get_key_and_relative_path(self, *, civ, input_prefixes):
211254
if str(civ.pk) in input_prefixes:
212255
key = safe_join(

app/tests/components_tests/test_amazon_sagemaker_training_backend.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ def test_execute(settings):
187187
"S3OutputPath": f"s3://grand-challenge-components-outputs//training-outputs/algorithms/job/{pk}"
188188
},
189189
"ResourceConfig": {
190-
"VolumeSizeInGB": 50,
190+
"VolumeSizeInGB": 30,
191191
"InstanceType": "ml.m5.large",
192192
"InstanceCount": 1,
193193
},
@@ -198,6 +198,7 @@ def test_execute(settings):
198198
"no_proxy": "amazonaws.com",
199199
"GRAND_CHALLENGE_COMPONENT_WRITABLE_DIRECTORIES": "/opt/ml/output/data:/opt/ml/model:/opt/ml/input/data/ground_truth/:opt/ml/checkpoints:/tmp",
200200
"GRAND_CHALLENGE_COMPONENT_POST_CLEAN_DIRECTORIES": "/opt/ml/output/data:/opt/ml/model:/opt/ml/input/data/ground_truth/",
201+
"GRAND_CHALLENGE_COMPONENT_MAX_MEMORY_MB": "7168",
201202
},
202203
"VpcConfig": {
203204
"SecurityGroupIds": [

dockerfiles/web-base/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ RUN mkdir -p /opt/docker \
7777

7878
ENV PYTHONUNBUFFERED=1\
7979
AWS_XRAY_SDK_ENABLED=false\
80-
COMPONENTS_SAGEMAKER_SHIM_VERSION=0.3.4\
80+
COMPONENTS_SAGEMAKER_SHIM_VERSION=0.3.5\
8181
PATH="/opt/poetry/.venv/bin:/home/django/.local/bin:${PATH}"
8282

8383
RUN mkdir -p /opt/poetry /app /static /opt/sagemaker-shim \
@@ -89,7 +89,7 @@ USER django:django
8989
# Fetch and install sagemaker shim for shimming containers
9090
RUN mkdir -p /opt/sagemaker-shim \
9191
&& wget "https://github.com/DIAGNijmegen/rse-sagemaker-shim/releases/download/v${COMPONENTS_SAGEMAKER_SHIM_VERSION}/sagemaker-shim-${COMPONENTS_SAGEMAKER_SHIM_VERSION}-Linux-x86_64.tar.gz" -P /opt/sagemaker-shim/ \
92-
&& echo "efc462a6efd75140da89cd9311b53ec99f228abe84703e31544972867f44e65d /opt/sagemaker-shim/sagemaker-shim-${COMPONENTS_SAGEMAKER_SHIM_VERSION}-Linux-x86_64.tar.gz" | shasum -c - || exit 1 \
92+
&& echo "a0f64b99ffea8faed65a23bf0f52ff1f2a20900ca8bc6a3d13a2ff7eff1d7eb7 /opt/sagemaker-shim/sagemaker-shim-${COMPONENTS_SAGEMAKER_SHIM_VERSION}-Linux-x86_64.tar.gz" | shasum -c - || exit 1 \
9393
&& tar -C /opt/sagemaker-shim/ -xzvf "/opt/sagemaker-shim/sagemaker-shim-${COMPONENTS_SAGEMAKER_SHIM_VERSION}-Linux-x86_64.tar.gz" \
9494
&& rm "/opt/sagemaker-shim/sagemaker-shim-${COMPONENTS_SAGEMAKER_SHIM_VERSION}-Linux-x86_64.tar.gz"
9595

poetry.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)