Load from checkpoint - not working #8

jesusbft · 2025-02-01T10:21:34Z

I am trying to load from a checkpoint, using the exact same command I used to train for just 1 epoch as a test:

party -d cuda:0 --precision bf16-true train -q early --lag 20 --augment --optimizer AdamW8bit --freeze-encoder --warmup 5000 -r 0.0003 -w 0.0001 --schedule constant -B 90 --workers 24 --threads 20 -t dataset/train.lst -e dataset/val.lst -o models/Portuguese --load-from-checkpoint models/Portuguese/checkpoint_00-0.2598.ckpt
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Trainer(val_check_interval=1.0) was configured so validation will run at the end of the training epoch..
Loading from checkpoint models/Portuguese/checkpoint_00-0.2598.ckpt.
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /workspace/venv/bin/party:8 in <module> │
│ │
│ 5 from party.cli import cli │
│ 6 if __name__ == '__main__': │
│ 7 │ sys.argv[0] = re.sub(r'(-script.pyw|.exe)?$', '', sys.argv[0]) │
│ ❱ 8 │ sys.exit(cli()) │
│ 9 │
│ │
│ /workspace/venv/lib/python3.11/site-packages/click/core.py:1161 in __call__ │
│ │
│ /workspace/venv/lib/python3.11/site-packages/click/core.py:1082 in main │
│ │
│ /workspace/venv/lib/python3.11/site-packages/click/core.py:1697 in invoke │
│ │
│ /workspace/venv/lib/python3.11/site-packages/click/core.py:1443 in invoke │
│ │
│ /workspace/venv/lib/python3.11/site-packages/click/core.py:788 in invoke │
│ │
│ /workspace/venv/lib/python3.11/site-packages/click/decorators.py:33 in new_func │
│ │
│ /workspace/party/party/cli/train.py:348 in train │
│ │
│ 345 │ with trainer.init_module(): │
│ 346 │ │ if load_from_checkpoint: │
│ 347 │ │ │ message(f'Loading from checkpoint {load_from_checkpoint}.') │
│ ❱ 348 │ │ │ model = RecognitionModel.load_from_checkpoint(load_from_checkpoint, │
│ 349 │ │ │ │ │ │ │ │ │ │ │ │ │ │ **hyper_params) │
│ 350 │ │ elif load_from_repo: │
│ 351 │ │ │ message(f'Loading from huggingface hub {load_from_repo}.') │
│ │
│ /workspace/venv/lib/python3.11/site-packages/lightning/pytorch/utilities/model_helpers.py:125 in │
│ wrapper │
│ │
│ 122 │ │ │ │ │ f"The classmethod {cls.__name__}.{self.method.__name__} cannot be │
│ 123 │ │ │ │ │ " Please call it on the class type and make sure the return value is │
│ 124 │ │ │ │ ) │
│ ❱ 125 │ │ │ return self.method(cls, *args, **kwargs) │
│ 126 │ │ │
│ 127 │ │ return wrapper │
│ 128 │
│ │
│ /workspace/venv/lib/python3.11/site-packages/lightning/pytorch/core/module.py:1582 in │
│ load_from_checkpoint │
│ │
│ 1579 │ │ │ y_hat = pretrained_model(x) │
│ 1580 │ │ │
│ 1581 │ │ """ │
│ ❱ 1582 │ │ loaded = _load_from_checkpoint( │
│ 1583 │ │ │ cls, │
│ 1584 │ │ │ checkpoint_path, │
│ 1585 │ │ │ map_location, │
│ │
│ /workspace/venv/lib/python3.11/site-packages/lightning/pytorch/core/saving.py:63 in │
│ _load_from_checkpoint │
│ │
│ 60 ) -> Union["pl.LightningModule", "pl.LightningDataModule"]: │
│ 61 │ map_location = map_location or _default_map_location │
│ 62 │ with pl_legacy_patch(): │
│ ❱ 63 │ │ checkpoint = pl_load(checkpoint_path, map_location=map_location) │
│ 64 │ │
│ 65 │ # convert legacy checkpoints to the new format │
│ 66 │ checkpoint = _pl_migrate_checkpoint( │
│ │
│ /workspace/venv/lib/python3.11/site-packages/lightning/fabric/utilities/cloud_io.py:60 in _load │
│ │
│ 57 │ │ ) │
│ 58 │ fs = get_filesystem(path_or_url) │
│ 59 │ with fs.open(path_or_url, "rb") as f: │
│ ❱ 60 │ │ return torch.load( │
│ 61 │ │ │ f, │
│ 62 │ │ │ map_location=map_location, # type: ignore[arg-type] │
│ 63 │ │ │ weights_only=weights_only, │
│ │
│ /workspace/venv/lib/python3.11/site-packages/torch/serialization.py:1360 in load │
│ │
│ 1357 │ │ │ │ │ │ ) │
│ 1358 │ │ │ │ │ except pickle.UnpicklingError as e: │
│ 1359 │ │ │ │ │ │ raise pickle.UnpicklingError(_get_wo_message(str(e))) from None │
│ ❱ 1360 │ │ │ │ return _load( │
│ 1361 │ │ │ │ │ opened_zipfile, │
│ 1362 │ │ │ │ │ map_location, │
│ 1363 │ │ │ │ │ pickle_module, │
│ │
│ /workspace/venv/lib/python3.11/site-packages/torch/serialization.py:1848 in _load │
│ │
│ 1845 │ # not connected (wrapper subclasses and tensors rebuilt using numpy) │
│ 1846 │ global _serialization_tls │
│ 1847 │ _serialization_tls.map_location = map_location │
│ ❱ 1848 │ result = unpickler.load() │
│ 1849 │ _serialization_tls.map_location = None │
│ 1850 │ │
│ 1851 │ torch._utils._validate_loaded_sparse_tensors() │
│ │
│ /usr/lib/python3.11/pickle.py:1213 in load │
│ │
│ 1210 │ │ │ │ if not key: │
│ 1211 │ │ │ │ │ raise EOFError │
│ 1212 │ │ │ │ assert isinstance(key, bytes_types) │
│ ❱ 1213 │ │ │ │ dispatch[key[0]](self) │
│ 1214 │ │ except _Stop as stopinst: │
│ 1215 │ │ │ return stopinst.value │
│ 1216 │
│ │
│ /usr/lib/python3.11/pickle.py:1590 in load_reduce │
│ │
│ 1587 │ │ stack = self.stack │
│ 1588 │ │ args = stack.pop() │
│ 1589 │ │ func = stack[-1] │
│ ❱ 1590 │ │ stack[-1] = func(*args) │
│ 1591 │ dispatch[REDUCE[0]] = load_reduce │
│ 1592 │ │
│ 1593 │ def load_pop(self): │
│ │
│ /workspace/venv/lib/python3.11/site-packages/torch/_tensor.py:57 in _rebuild_from_type_v2 │
│ │
│ 54 │
│ 55 │
│ 56 def _rebuild_from_type_v2(func, new_type, args, state): │
│ ❱ 57 │ ret = func(*args) │
│ 58 │ if type(ret) is not new_type: │
│ 59 │ │ ret = ret.as_subclass(new_type) │
│ 60 │ # Tensor does define __setstate__ even though it doesn't define │
│ │
│ /workspace/venv/lib/python3.11/site-packages/torch/_utils.py:360 in _rebuild_wrapper_subclass │
│ │
│ 357 │ device, │
│ 358 │ requires_grad, │
│ 359 ): │
│ ❱ 360 │ device = _get_restore_location(device) │
│ 361 │ return torch.Tensor._make_wrapper_subclass( # type: ignore[attr-defined] │
│ 362 │ │ cls, │
│ 363 │ │ size, │
│ │
│ /workspace/venv/lib/python3.11/site-packages/torch/_utils.py:127 in _get_restore_location │
│ │
│ 124 │ │ │ return map_location │
│ 125 │ │ else: │
│ 126 │ │ │ assert callable(map_location) │
│ ❱ 127 │ │ │ raise RuntimeError( │
│ 128 │ │ │ │ "Callable map_location not supported with _rebuild_wrapper_subclass " │
│ 129 │ │ │ │ "or _rebuild_device_tensor_from_numpy" │
│ 130 │ │ │ ) │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Callable map_location not supported with _rebuild_wrapper_subclass or _rebuild_device_tensor_from_numpy

The text was updated successfully, but these errors were encountered:

jesusbft · 2025-02-01T20:58:38Z

With different parameters (precision and optimizer), it works well:

party -d cuda:0 --precision bf16-mixed train -q early --lag 20 --augment --optimizer AdamW --freeze-encoder --warmup 5000 -r 0.0003 -w 0.0001 --schedule constant -B 32 --workers 16 --threads 16 -t dataset/train.lst -e dataset/val.lst -o models/Portuguese --load-from-checkpoint models/Portuguese/checkpoint_05-0.2234.ckpt
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Trainer(val_check_interval=1.0) was configured so validation will run at the end of the training epoch..
Loading from checkpoint models/Portuguese/checkpoint_05-0.2234.ckpt.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
┏━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓
┃ ┃ Name ┃ Type ┃ Params ┃ Mode ┃
┡━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩
│ 0 │ model │ OptimizedModule │ 214 M │ train │
│ 1 │ model._orig_mod │ PartyModel │ 214 M │ train │
│ 2 │ criterion │ CrossEntropyLoss │ 0 │ train │
│ 3 │ val_mean │ MeanMetric │ 0 │ train │
└───┴─────────────────┴──────────────────┴────────┴───────┘
Trainable params: 154 M
Non-trainable params: 59.5 M
Total params: 214 M
Total estimated model params size (MB): 856
Modules in train mode: 1078
Modules in eval mode: 0
Epoch 0/-2 ━━━━━━━━━24/6253 0:08:22 • 0:49:08 2.11it/s v_num: 4.000 train_loss: 0.120

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Load from checkpoint - not working #8

Load from checkpoint - not working #8

jesusbft commented Feb 1, 2025

jesusbft commented Feb 1, 2025

Load from checkpoint - not working #8

Load from checkpoint - not working #8

Comments

jesusbft commented Feb 1, 2025

jesusbft commented Feb 1, 2025