2 changes: 2 additions & 0 deletions src/lightning/pytorch/CHANGELOG.md
@@ -52,6 +52,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed recursive symlink creation when `save_last='link'` and `save_top_k=-1` ([#21186](https://github.com/Lightning-AI/pytorch-lightning/pull/21186))


- Fixed `last.ckpt` being created and not linked to another checkpoint ([#21244](https://github.com/Lightning-AI/pytorch-lightning/pull/21244))

---

## [2.5.5] - 2025-09-05
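For context, the `last.ckpt` entry added above concerns setups along the following lines. This is a minimal sketch assembled from the test added in this PR, not code from the PR itself; `model` is a placeholder for any LightningModule that logs `val_loss`:

```python
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint

# Minimal sketch of an affected setup: with save_last=True and a monitored metric,
# last.ckpt should only be written when another checkpoint is actually saved in
# that step (or, with save_last="link", when there is a checkpoint to link to).
checkpoint_callback = ModelCheckpoint(
    monitor="val_loss",
    save_top_k=1,
    save_last=True,  # or "link" to make last.ckpt a symlink to the latest checkpoint
    save_on_train_epoch_end=False,
)
trainer = Trainer(max_epochs=4, callbacks=[checkpoint_callback])
# trainer.fit(model)  # `model` is any LightningModule that logs "val_loss"
```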
12 changes: 10 additions & 2 deletions src/lightning/pytorch/callbacks/model_checkpoint.py
@@ -380,7 +380,11 @@ def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
             monitor_candidates = self._monitor_candidates(trainer)
             if self._every_n_epochs >= 1 and (trainer.current_epoch + 1) % self._every_n_epochs == 0:
                 self._save_topk_checkpoint(trainer, monitor_candidates)
-            self._save_last_checkpoint(trainer, monitor_candidates)
+            # Only save last checkpoint if a checkpoint was actually saved in this step or if save_last="link"
+            if self._last_global_step_saved == trainer.global_step or (
+                self.save_last == "link" and self._last_checkpoint_saved
+            ):
+                self._save_last_checkpoint(trainer, monitor_candidates)

     @override
     def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
@@ -397,7 +401,11 @@ def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:

         if self._every_n_epochs >= 1 and (trainer.current_epoch + 1) % self._every_n_epochs == 0:
             self._save_topk_checkpoint(trainer, monitor_candidates)
-        self._save_last_checkpoint(trainer, monitor_candidates)
+        # Only save last checkpoint if a checkpoint was actually saved in this step or if save_last="link"
+        if self._last_global_step_saved == trainer.global_step or (
+            self.save_last == "link" and self._last_checkpoint_saved
+        ):
+            self._save_last_checkpoint(trainer, monitor_candidates)

     @override
     def on_exception(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", exception: BaseException) -> None:
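Taken on its own, the guard added in both hooks reduces to a small predicate over the callback's bookkeeping attributes. The helper below is only an illustrative restatement; the name `should_write_last` and its signature are not part of the Lightning API:

```python
def should_write_last(
    last_global_step_saved: int,
    global_step: int,
    save_last,  # True, False, None, or "link", mirroring ModelCheckpoint.save_last
    last_checkpoint_saved: str,
) -> bool:
    """Write/refresh last.ckpt only if a checkpoint was saved at this step,
    or if save_last="link" and there is already a checkpoint to link to."""
    return last_global_step_saved == global_step or (
        save_last == "link" and bool(last_checkpoint_saved)
    )


# Nothing was saved at this step and save_last is plain True -> last.ckpt is not written.
assert should_write_last(last_global_step_saved=10, global_step=12, save_last=True, last_checkpoint_saved="") is False
```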
56 changes: 56 additions & 0 deletions tests/tests_pytorch/checkpointing/test_model_checkpoint.py
@@ -2124,3 +2124,59 @@ def test_save_last_without_save_on_train_epoch_and_without_val(tmp_path):

    # save_last=True should always save last.ckpt
    assert (tmp_path / "last.ckpt").exists()


def test_save_last_only_when_checkpoint_saved(tmp_path):
    """Test that save_last only creates last.ckpt when another checkpoint is actually saved."""

    class SelectiveModel(BoringModel):
        def __init__(self):
            super().__init__()
            self.validation_step_outputs = []

        def validation_step(self, batch, batch_idx):
            outputs = super().validation_step(batch, batch_idx)
            epoch = self.trainer.current_epoch
            loss = torch.tensor(1.0 - epoch * 0.1) if epoch % 2 == 0 else torch.tensor(1.0 + epoch * 0.1)
            outputs["val_loss"] = loss
            self.validation_step_outputs.append(outputs)
            return outputs

        def on_validation_epoch_end(self):
            if self.validation_step_outputs:
                avg_loss = torch.stack([x["val_loss"] for x in self.validation_step_outputs]).mean()
                self.log("val_loss", avg_loss)
                self.validation_step_outputs.clear()

    model = SelectiveModel()

    checkpoint_callback = ModelCheckpoint(
        dirpath=tmp_path,
        filename="best-{epoch}-{val_loss:.2f}",
        monitor="val_loss",
        save_last=True,
        save_top_k=1,
        mode="min",
        every_n_epochs=1,
        save_on_train_epoch_end=False,
    )

    trainer = Trainer(
        max_epochs=4,
        callbacks=[checkpoint_callback],
        logger=False,
        enable_progress_bar=False,
        limit_train_batches=2,
        limit_val_batches=2,
        enable_checkpointing=True,
    )

    trainer.fit(model)

    checkpoint_files = list(tmp_path.glob("*.ckpt"))
    checkpoint_names = [f.name for f in checkpoint_files]
    assert "last.ckpt" in checkpoint_names, "last.ckpt should exist since checkpoints were saved"
    expected_files = 2  # best checkpoint + last.ckpt
    assert len(checkpoint_files) == expected_files, (
        f"Expected {expected_files} files, got {len(checkpoint_files)}: {checkpoint_names}"
    )
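Assuming a standard development checkout of the repository, the new test can be exercised in isolation with `pytest tests/tests_pytorch/checkpointing/test_model_checkpoint.py::test_save_last_only_when_checkpoint_saved`; with the alternating `val_loss` pattern above, only the best checkpoint and `last.ckpt` should remain in `tmp_path` after training.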