Skip to content

Commit 0e04540

Browse files
authored
AWS Lambda Deployment misc improvements (#407)
* Delete the deployment record when creating the related cloud resources fails
* Improve the error message
1 parent b646ec3 commit 0e04540

File tree

10 files changed

+172
-181
lines changed

10 files changed

+172
-181
lines changed

bentoml/cli/deployment.py

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -303,17 +303,24 @@ def create(
303303
'timeout': timeout,
304304
}
305305
yatai_service = get_yatai_service()
306-
result = create_deployment(
307-
name,
308-
namespace,
309-
bento_name,
310-
bento_version,
311-
platform,
312-
operator_spec,
313-
parse_key_value_pairs(labels),
314-
parse_key_value_pairs(annotations),
315-
yatai_service,
316-
)
306+
try:
307+
result = create_deployment(
308+
name,
309+
namespace,
310+
bento_name,
311+
bento_version,
312+
platform,
313+
operator_spec,
314+
parse_key_value_pairs(labels),
315+
parse_key_value_pairs(annotations),
316+
yatai_service,
317+
)
318+
except BentoMLException as e:
319+
_echo(
320+
'Failed to create deployment {}.: {}'.format(name, str(e)),
321+
CLI_COLOR_ERROR,
322+
)
323+
return
317324

318325
if result.status.status_code != status_pb2.Status.OK:
319326
_echo(

bentoml/deployment/aws_lambda/__init__.py

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@
4444
from bentoml.exceptions import BentoMLException
4545
from bentoml.proto.deployment_pb2 import (
4646
ApplyDeploymentResponse,
47-
Deployment,
4847
DeploymentState,
4948
DescribeDeploymentResponse,
5049
DeleteDeploymentResponse,
@@ -307,31 +306,19 @@ def _apply(
307306
)
308307
return ApplyDeploymentResponse(status=Status.INTERNAL(str(e)))
309308

310-
logger.info('Finish deployed lambda project, fetching latest status')
311-
res_deployment_pb = Deployment(state=DeploymentState())
312-
res_deployment_pb.CopyFrom(deployment_pb)
313-
state = self.describe(res_deployment_pb, yatai_service).state
314-
res_deployment_pb.state.CopyFrom(state)
309+
deployment_pb.state.state = DeploymentState.PENDING
310+
return ApplyDeploymentResponse(status=Status.OK(), deployment=deployment_pb)
311+
except BentoMLException as error:
312+
deployment_pb.state = DeploymentState(
313+
state=DeploymentState.ERROR, error_message='Error: {}'.format(error)
314+
)
315315
return ApplyDeploymentResponse(
316-
status=Status.OK(), deployment=res_deployment_pb
316+
status=exception_to_return_status(error), deployment=deployment_pb
317317
)
318318

319-
except BentoMLException as error:
320-
return ApplyDeploymentResponse(status=exception_to_return_status(error))
321-
322319
def delete(self, deployment_pb, yatai_service):
323320
try:
324321
logger.debug('Deleting AWS Lambda deployment')
325-
describe_state_result = self.describe(deployment_pb, yatai_service).state
326-
if describe_state_result.state != DeploymentState.RUNNING:
327-
message = (
328-
'Failed to delete, no active deployment {name}. '
329-
'The current state is {state}'.format(
330-
name=deployment_pb.name,
331-
state=DeploymentState.State.Name(describe_state_result.state),
332-
)
333-
)
334-
return DeleteDeploymentResponse(status=Status.ABORTED(message))
335322

336323
deployment_spec = deployment_pb.spec
337324
lambda_deployment_config = deployment_spec.aws_lambda_operator_config
@@ -340,10 +327,11 @@ def delete(self, deployment_pb, yatai_service):
340327
stack_name = generate_aws_compatible_string(
341328
deployment_pb.namespace + '-' + deployment_pb.name
342329
)
343-
deployment_info_json = json.loads(describe_state_result.info_json)
344-
bucket_name = deployment_info_json.get('s3_bucket')
345-
if bucket_name:
346-
_cleanup_s3_bucket(bucket_name, lambda_deployment_config.region)
330+
if deployment_pb.state.info_json:
331+
deployment_info_json = json.loads(deployment_pb.state.info_json)
332+
bucket_name = deployment_info_json.get('s3_bucket')
333+
if bucket_name:
334+
_cleanup_s3_bucket(bucket_name, lambda_deployment_config.region)
347335

348336
logger.debug(
349337
'Deleting AWS CloudFormation: %s that includes Lambda function '

bentoml/deployment/aws_lambda/utils.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,10 @@ def ensure_sam_available_or_raise():
3333
try:
3434
import samcli
3535

36-
if list(map(int, samcli.__version__.split('.'))) < [0, 33, 1]:
36+
if samcli.__version__ != '0.33.1':
3737
raise BentoMLException(
38-
'aws-sam-cli package requires version 0.33.1 or '
39-
'higher. Update the package with `pip install -U aws-sam-cli`'
38+
'aws-sam-cli package requires version 0.33.1 '
39+
'Install the package with `pip install -U aws-sam-cli==0.33.1`'
4040
)
4141
except ImportError:
4242
raise ImportError(
@@ -172,12 +172,12 @@ def init_sam_project(
172172
requirement_txt_path = os.path.join(bento_service_bundle_path, 'requirements.txt')
173173
shutil.copy(requirement_txt_path, function_path)
174174

175-
# Copy bundled pip dependencies
176-
logger.debug('Coping bundled_dependencies')
177175
bundled_dep_path = os.path.join(
178176
bento_service_bundle_path, 'bundled_pip_dependencies'
179177
)
180178
if os.path.isdir(bundled_dep_path):
179+
# Copy bundled pip dependencies
180+
logger.debug('Coping bundled_dependencies')
181181
shutil.copytree(
182182
bundled_dep_path, os.path.join(function_path, 'bundled_pip_dependencies')
183183
)

bentoml/deployment/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def ensure_docker_available_or_raise():
7070
subprocess.check_output(['docker', 'info'])
7171
except subprocess.CalledProcessError as error:
7272
raise BentoMLException(
73-
'Error executing docker command: {}'.format(error.output)
73+
'Error executing docker command: {}'.format(error.output.decode())
7474
)
7575
except not_found_error:
7676
raise BentoMLMissingDependencyException(

bentoml/yatai/python_api.py

Lines changed: 86 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -213,90 +213,98 @@ def create_deployment(
213213

214214
yatai_service = get_yatai_service()
215215

216-
try:
217-
# Make sure there is no active deployment with the same deployment name
218-
get_deployment_pb = yatai_service.GetDeployment(
219-
GetDeploymentRequest(deployment_name=deployment_name, namespace=namespace)
216+
# Make sure there is no active deployment with the same deployment name
217+
get_deployment_pb = yatai_service.GetDeployment(
218+
GetDeploymentRequest(deployment_name=deployment_name, namespace=namespace)
219+
)
220+
if get_deployment_pb.status.status_code == status_pb2.Status.OK:
221+
raise BentoMLDeploymentException(
222+
'Deployment "{name}" already existed, use Update or Apply for updating '
223+
'existing deployment, delete the deployment, or use a different deployment '
224+
'name'.format(name=deployment_name)
220225
)
221-
if get_deployment_pb.status.status_code == status_pb2.Status.OK:
222-
raise BentoMLDeploymentException(
223-
'Deployment "{name}" already existed, use Update or Apply for updating'
224-
'existing deployment, or create the deployment with a different name or'
225-
'under a different deployment namespace'.format(name=deployment_name)
226-
)
227-
if get_deployment_pb.status.status_code != status_pb2.Status.NOT_FOUND:
228-
raise BentoMLDeploymentException(
229-
'Failed accesing YataiService deployment store. {error_code}:'
230-
'{error_message}'.format(
231-
error_code=Status.Name(get_deployment_pb.status.status_code),
232-
error_message=get_deployment_pb.status.error_message,
233-
)
226+
if get_deployment_pb.status.status_code != status_pb2.Status.NOT_FOUND:
227+
raise BentoMLDeploymentException(
228+
'Failed accesing YataiService deployment store. {error_code}:'
229+
'{error_message}'.format(
230+
error_code=Status.Name(get_deployment_pb.status.status_code),
231+
error_message=get_deployment_pb.status.error_message,
234232
)
233+
)
235234

236-
deployment_dict = {
237-
"name": deployment_name,
238-
"namespace": namespace or config().get('deployment', 'default_namespace'),
239-
"labels": labels,
240-
"annotations": annotations,
241-
"spec": {
242-
"bento_name": bento_name,
243-
"bento_version": bento_version,
244-
"operator": platform,
245-
},
235+
deployment_dict = {
236+
"name": deployment_name,
237+
"namespace": namespace or config().get('deployment', 'default_namespace'),
238+
"labels": labels,
239+
"annotations": annotations,
240+
"spec": {
241+
"bento_name": bento_name,
242+
"bento_version": bento_version,
243+
"operator": platform,
244+
},
245+
}
246+
247+
operator = platform.replace('-', '_').upper()
248+
try:
249+
operator_value = DeploymentSpec.DeploymentOperator.Value(operator)
250+
except ValueError:
251+
return ApplyDeploymentResponse(
252+
status=Status.INVALID_ARGUMENT('Invalid platform "{}"'.format(platform))
253+
)
254+
if operator_value == DeploymentSpec.AWS_SAGEMAKER:
255+
deployment_dict['spec']['sagemaker_operator_config'] = {
256+
'region': operator_spec.get('region')
257+
or config().get('aws', 'default_region'),
258+
'instance_count': operator_spec.get('instance_count')
259+
or config().getint('sagemaker', 'default_instance_count'),
260+
'instance_type': operator_spec.get('instance_type')
261+
or config().get('sagemaker', 'default_instance_type'),
262+
'api_name': operator_spec.get('api_name', ''),
263+
}
264+
elif operator_value == DeploymentSpec.AWS_LAMBDA:
265+
deployment_dict['spec']['aws_lambda_operator_config'] = {
266+
'region': operator_spec.get('region')
267+
or config().get('aws', 'default_region')
268+
}
269+
for field in ['api_name', 'memory_size', 'timeout']:
270+
if operator_spec.get(field):
271+
deployment_dict['spec']['aws_lambda_operator_config'][
272+
field
273+
] = operator_spec[field]
274+
elif operator_value == DeploymentSpec.GCP_FCUNTION:
275+
deployment_dict['spec']['gcp_function_operatorConfig'] = {
276+
'region': operator_spec.get('region')
277+
or config().get('google-cloud', 'default_region')
246278
}
279+
if operator_spec.get('api_name'):
280+
deployment_dict['spec']['gcp_function_operator_config'][
281+
'api_name'
282+
] = operator_spec['api_name']
283+
elif operator_value == DeploymentSpec.KUBERNETES:
284+
deployment_dict['spec']['kubernetes_operator_config'] = {
285+
'kube_namespace': operator_spec.get('kube_namespace', ''),
286+
'replicas': operator_spec.get('replicas', 0),
287+
'service_name': operator_spec.get('service_name', ''),
288+
'service_type': operator_spec.get('service_type', ''),
289+
}
290+
else:
291+
raise BentoMLDeploymentException(
292+
'Platform "{}" is not supported in the current version of '
293+
'BentoML'.format(platform)
294+
)
247295

248-
operator = platform.replace('-', '_').upper()
249-
try:
250-
operator_value = DeploymentSpec.DeploymentOperator.Value(operator)
251-
except ValueError:
252-
return ApplyDeploymentResponse(
253-
status=Status.INVALID_ARGUMENT('Invalid platform "{}"'.format(platform))
254-
)
255-
if operator_value == DeploymentSpec.AWS_SAGEMAKER:
256-
deployment_dict['spec']['sagemaker_operator_config'] = {
257-
'region': operator_spec.get('region')
258-
or config().get('aws', 'default_region'),
259-
'instance_count': operator_spec.get('instance_count')
260-
or config().getint('sagemaker', 'default_instance_count'),
261-
'instance_type': operator_spec.get('instance_type')
262-
or config().get('sagemaker', 'default_instance_type'),
263-
'api_name': operator_spec.get('api_name', ''),
264-
}
265-
elif operator_value == DeploymentSpec.AWS_LAMBDA:
266-
deployment_dict['spec']['aws_lambda_operator_config'] = {
267-
'region': operator_spec.get('region')
268-
or config().get('aws', 'default_region')
269-
}
270-
for field in ['api_name', 'memory_size', 'timeout']:
271-
if operator_spec.get(field):
272-
deployment_dict['spec']['aws_lambda_operator_config'][
273-
field
274-
] = operator_spec[field]
275-
elif operator_value == DeploymentSpec.GCP_FCUNTION:
276-
deployment_dict['spec']['gcp_function_operatorConfig'] = {
277-
'region': operator_spec.get('region')
278-
or config().get('google-cloud', 'default_region')
279-
}
280-
if operator_spec.get('api_name'):
281-
deployment_dict['spec']['gcp_function_operator_config'][
282-
'api_name'
283-
] = operator_spec['api_name']
284-
elif operator_value == DeploymentSpec.KUBERNETES:
285-
deployment_dict['spec']['kubernetes_operator_config'] = {
286-
'kube_namespace': operator_spec.get('kube_namespace', ''),
287-
'replicas': operator_spec.get('replicas', 0),
288-
'service_name': operator_spec.get('service_name', ''),
289-
'service_type': operator_spec.get('service_type', ''),
290-
}
291-
else:
292-
raise BentoMLDeploymentException(
293-
'Platform "{}" is not supported in the current version of '
294-
'BentoML'.format(platform)
295-
)
296+
apply_response = apply_deployment(deployment_dict, yatai_service)
296297

297-
return apply_deployment(deployment_dict, yatai_service)
298-
except BentoMLException as error:
299-
return ApplyDeploymentResponse(status=Status.INTERNAL(str(error)))
298+
if apply_response.status.status_code == status_pb2.Status.OK:
299+
describe_response = describe_deployment(
300+
deployment_name, namespace, yatai_service
301+
)
302+
if describe_response.status.status_code == status_pb2.Status.OK:
303+
deployment_state = describe_response.state
304+
apply_response.deployment.state.CopyFrom(deployment_state)
305+
return apply_response
306+
307+
return apply_response
300308

301309

302310
# TODO: update_deployment is not finished. It will be worked on along with the CLI command

bentoml/yatai/yatai_service_impl.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,32 @@ def ApplyDeployment(self, request, context=None):
117117
# deploying to target platform
118118
response = operator.apply(request.deployment, self, previous_deployment)
119119

120-
# update deployment state
121-
self.deployment_store.insert_or_update(response.deployment)
120+
if response.status.status_code == status_pb2.Status.OK:
121+
# update deployment state
122+
if response and response.deployment:
123+
self.deployment_store.insert_or_update(response.deployment)
124+
else:
125+
raise BentoMLException(
126+
"DeploymentOperator Internal Error: Invalid Response"
127+
)
128+
logger.info(
129+
"ApplyDeployment (%s, namespace %s) succeeded",
130+
request.deployment.name,
131+
request.deployment.namespace,
132+
)
133+
else:
134+
if not previous_deployment:
135+
# When failed to create the deployment, delete it from active
136+
# deployments records
137+
self.deployment_store.delete(
138+
request.deployment.name, request.deployment.namespace
139+
)
140+
logger.debug(
141+
"ApplyDeployment (%s, namespace %s) failed: %s",
142+
request.deployment.name,
143+
request.deployment.namespace,
144+
response.status.error_message,
145+
)
122146

123147
return response
124148

@@ -299,6 +323,7 @@ def DangerouslyDeleteBento(self, request, context=None):
299323
self.bento_metadata_store.dangerously_delete(
300324
request.bento_name, request.bento_version
301325
)
326+
self.repo.dangerously_delete(request.bento_name, request.bento_version)
302327
except BentoMLException as e:
303328
logger.error("INTERNAL ERROR: %s", e)
304329
return DangerouslyDeleteBentoResponse(status=Status.INTERNAL(str(e)))

0 commit comments

Comments
 (0)