Skip to content

Commit

Permalink
Train launch script cleanup
Browse files Browse the repository at this point in the history
Signed-off-by: Kelly A <kellyaa@users.noreply.github.com>
  • Loading branch information
kellyaa committed May 7, 2024
1 parent f11b5b0 commit d085297
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 6 deletions.
15 changes: 11 additions & 4 deletions caikit/runtime/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,14 @@
log = alog.use_channel("TRAIN")
error = error_handler.get(log)

# The USER_ERROR_EXIT_CODE is used when the process must exit
# as a result of a user input error. User-related errors should be
# >= 1 and <= 127 due to how some kubernetes operators interpret them.
USER_ERROR_EXIT_CODE = 1
INTERNAL_ERROR_EXIT_CODE = 200
# The INTERNAL_ERROR_EXIT_CODE is used when training
# terminates abnormally and it is not clearly the fault of the user.
# System-level errors should be >= 128 and <= 254.
INTERNAL_ERROR_EXIT_CODE = 203


class ArgumentParserError(Exception):
Expand All @@ -66,7 +72,8 @@ def write_termination_log(text: str, log_file="/dev/termination-log"):
except Exception as e:
log.warning(
"<COR96300323W>",
"Unable to write termination log due to error {}".format(e),
"Unable to write termination log due to error %s",
e,
)


Expand Down Expand Up @@ -193,7 +200,7 @@ def main() -> int:
"Unable to find module {} to train",
args.module,
)
except (ValueError, Exception):
except Exception:
message = "Unable to find module {} to train".format(args.module)
log.warning(
{
Expand Down Expand Up @@ -291,7 +298,7 @@ def main() -> int:
for err in info.errors or []:
log.error(err)
write_termination_log("Training finished unsuccessfully")
return INTERNAL_ERROR_EXIT_CODE
exit(INTERNAL_ERROR_EXIT_CODE)
except MemoryError:
message = "OOM error during training"
log.warning(
Expand Down
10 changes: 8 additions & 2 deletions tests/runtime/test_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,10 @@ def test_failed_training():
"--training-kwargs",
json.dumps(training_kwargs),
):
assert main() == train.INTERNAL_ERROR_EXIT_CODE
with pytest.raises(SystemExit) as pytest_wrapped_e:
main()
assert pytest_wrapped_e.type == SystemExit
assert pytest_wrapped_e.value.code == train.INTERNAL_ERROR_EXIT_CODE


def test_bad_module():
Expand Down Expand Up @@ -319,4 +322,7 @@ def test_non_existent_save_path():
"--training-kwargs",
json.dumps(SAMPLE_TRAIN_KWARGS),
):
assert main() == train.INTERNAL_ERROR_EXIT_CODE
with pytest.raises(SystemExit) as pytest_wrapped_e:
main()
assert pytest_wrapped_e.type == SystemExit
assert pytest_wrapped_e.value.code == train.INTERNAL_ERROR_EXIT_CODE

0 comments on commit d085297

Please sign in to comment.