Skip to content

Commit

Permalink
Write .complete file regardless of successful training
Browse files Browse the repository at this point in the history
Signed-off-by: Kelly A <kellyaa@users.noreply.github.com>
  • Loading branch information
kellyaa committed Jun 11, 2024
1 parent c7e00e9 commit 372f29c
Showing 1 changed file with 15 additions and 10 deletions.
25 changes: 15 additions & 10 deletions caikit/runtime/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,12 @@ def main() -> int:
"""Main entrypoint for running training jobs"""
parser = TrainArgumentParser(description=__doc__)

# Set default values for the termination log in case parsing the arguments fails later on
enable_termination_log = os.environ.get("ENABLE_TERMINATION_LOG", True)
termination_log_file = os.environ.get(
"TERMINATION_LOG_FILE", "/dev/termination-log"
)

# Required Args
parser.add_argument(
"--training-kwargs",
Expand Down Expand Up @@ -133,18 +139,15 @@ def main() -> int:
parser.add_argument(
"--termination-log-file",
"-f",
default=termination_log_file,
help="Location of where to write a termination error message",
)
parser.add_argument(
"--enable-termination-log",
"-e",
default=enable_termination_log,
help="Whether to enable writing to termination log when training fails",
)
# Set default values for the termination log in case parsing the arguments fails later on
enable_termination_log = os.environ.get("ENABLE_TERMINATION_LOG", True)
termination_log_file = os.environ.get(
"TERMINATION_LOG_FILE", "/dev/termination-log"
)

try:
args = parser.parse_args()
Expand Down Expand Up @@ -311,13 +314,15 @@ def main() -> int:
args.model_name,
):
future = train(module, wait=True, **train_kwargs)

# The .complete file indicates all files have completed
# being written to the filesystem
complete_path = os.path.join(args.save_path, ".complete")
log.info(f"Creating completion file at: {complete_path}")
Path(complete_path).touch()

info = future.get_info()
if info.status == TrainingStatus.COMPLETED:
# The .complete file indicates all model files have completed
# being written to the filesystem
complete_path = os.path.join(args.save_path, ".complete")
log.info(f"Creating completion file at: {complete_path}")
Path(complete_path).touch()
log.info("Training finished successfully")
return 0
else:
Expand Down

0 comments on commit 372f29c

Please sign in to comment.