We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Trying to load from a checkpoint, using the exact same command with which I trained for just 1 epoch as a test:
party -d cuda:0 --precision bf16-true train -q early --lag 20 --augment --optimizer AdamW8bit --freeze-encoder --warmup 5000 -r 0.0003 -w 0.0001 --schedule constant -B 90 --workers 24 --threads 20 -t dataset/train.lst -e dataset/val.lst -o models/Portuguese --load-from-checkpoint models/Portuguese/checkpoint_00-0.2598.ckpt GPU available: True (cuda), used: True TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs Trainer(val_check_interval=1.0) was configured so validation will run at the end of the training epoch.. Loading from checkpoint models/Portuguese/checkpoint_00-0.2598.ckpt. ╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ /workspace/venv/bin/party:8 in │ │ │ │ 5 from party.cli import cli │ │ 6 if name == 'main': │ │ 7 │ sys.argv[0] = re.sub(r'(-script.pyw|.exe)?$', '', sys.argv[0]) │ │ ❱ 8 │ sys.exit(cli()) │ │ 9 │ │ │ │ /workspace/venv/lib/python3.11/site-packages/click/core.py:1161 in call │ │ │ │ /workspace/venv/lib/python3.11/site-packages/click/core.py:1082 in main │ │ │ │ /workspace/venv/lib/python3.11/site-packages/click/core.py:1697 in invoke │ │ │ │ /workspace/venv/lib/python3.11/site-packages/click/core.py:1443 in invoke │ │ │ │ /workspace/venv/lib/python3.11/site-packages/click/core.py:788 in invoke │ │ │ │ /workspace/venv/lib/python3.11/site-packages/click/decorators.py:33 in new_func │ │ │ │ /workspace/party/party/cli/train.py:348 in train │ │ │ │ 345 │ with trainer.init_module(): │ │ 346 │ │ if load_from_checkpoint: │ │ 347 │ │ │ message(f'Loading from checkpoint {load_from_checkpoint}.') │ │ ❱ 348 │ │ │ model = RecognitionModel.load_from_checkpoint(load_from_checkpoint, │ │ 349 │ │ │ │ │ │ │ │ │ │ │ │ │ │ **hyper_params) │ │ 350 │ │ elif load_from_repo: │ │ 351 │ │ │ message(f'Loading from huggingface hub {load_from_repo}.') │ │ │ │ /workspace/venv/lib/python3.11/site-packages/lightning/pytorch/utilities/model_helpers.py:125 in │ │ wrapper │ │ │ │ 122 │ │ │ │ │ 
f"The classmethod {cls.__name__}.{self.method.__name__} cannot be │ │ 123 │ │ │ │ │ " Please call it on the class type and make sure the return value is │ │ 124 │ │ │ │ ) │ │ ❱ 125 │ │ │ return self.method(cls, *args, **kwargs) │ │ 126 │ │ │ │ 127 │ │ return wrapper │ │ 128 │ │ │ │ /workspace/venv/lib/python3.11/site-packages/lightning/pytorch/core/module.py:1582 in │ │ load_from_checkpoint │ │ │ │ 1579 │ │ │ y_hat = pretrained_model(x) │ │ 1580 │ │ │ │ 1581 │ │ """ │ │ ❱ 1582 │ │ loaded = _load_from_checkpoint( │ │ 1583 │ │ │ cls, │ │ 1584 │ │ │ checkpoint_path, │ │ 1585 │ │ │ map_location, │ │ │ │ /workspace/venv/lib/python3.11/site-packages/lightning/pytorch/core/saving.py:63 in │ │ _load_from_checkpoint │ │ │ │ 60 ) -> Union["pl.LightningModule", "pl.LightningDataModule"]: │ │ 61 │ map_location = map_location or _default_map_location │ │ 62 │ with pl_legacy_patch(): │ │ ❱ 63 │ │ checkpoint = pl_load(checkpoint_path, map_location=map_location) │ │ 64 │ │ │ 65 │ # convert legacy checkpoints to the new format │ │ 66 │ checkpoint = _pl_migrate_checkpoint( │ │ │ │ /workspace/venv/lib/python3.11/site-packages/lightning/fabric/utilities/cloud_io.py:60 in _load │ │ │ │ 57 │ │ ) │ │ 58 │ fs = get_filesystem(path_or_url) │ │ 59 │ with fs.open(path_or_url, "rb") as f: │ │ ❱ 60 │ │ return torch.load( │ │ 61 │ │ │ f, │ │ 62 │ │ │ map_location=map_location, # type: ignore[arg-type] │ │ 63 │ │ │ weights_only=weights_only, │ │ │ │ /workspace/venv/lib/python3.11/site-packages/torch/serialization.py:1360 in load │ │ │ │ 1357 │ │ │ │ │ │ ) │ │ 1358 │ │ │ │ │ except pickle.UnpicklingError as e: │ │ 1359 │ │ │ │ │ │ raise pickle.UnpicklingError(_get_wo_message(str(e))) from None │ │ ❱ 1360 │ │ │ │ return _load( │ │ 1361 │ │ │ │ │ opened_zipfile, │ │ 1362 │ │ │ │ │ map_location, │ │ 1363 │ │ │ │ │ pickle_module, │ │ │ │ /workspace/venv/lib/python3.11/site-packages/torch/serialization.py:1848 in _load │ │ │ │ 1845 │ # not connected (wrapper subclasses and tensors rebuilt using numpy) 
│ │ 1846 │ global _serialization_tls │ │ 1847 │ _serialization_tls.map_location = map_location │ │ ❱ 1848 │ result = unpickler.load() │ │ 1849 │ _serialization_tls.map_location = None │ │ 1850 │ │ │ 1851 │ torch._utils._validate_loaded_sparse_tensors() │ │ │ │ /usr/lib/python3.11/pickle.py:1213 in load │ │ │ │ 1210 │ │ │ │ if not key: │ │ 1211 │ │ │ │ │ raise EOFError │ │ 1212 │ │ │ │ assert isinstance(key, bytes_types) │ │ ❱ 1213 │ │ │ │ dispatchkey[0] │ │ 1214 │ │ except _Stop as stopinst: │ │ 1215 │ │ │ return stopinst.value │ │ 1216 │ │ │ │ /usr/lib/python3.11/pickle.py:1590 in load_reduce │ │ │ │ 1587 │ │ stack = self.stack │ │ 1588 │ │ args = stack.pop() │ │ 1589 │ │ func = stack[-1] │ │ ❱ 1590 │ │ stack[-1] = func(*args) │ │ 1591 │ dispatch[REDUCE[0]] = load_reduce │ │ 1592 │ │ │ 1593 │ def load_pop(self): │ │ │ │ /workspace/venv/lib/python3.11/site-packages/torch/_tensor.py:57 in _rebuild_from_type_v2 │ │ │ │ 54 │ │ 55 │ │ 56 def _rebuild_from_type_v2(func, new_type, args, state): │ │ ❱ 57 │ ret = func(*args) │ │ 58 │ if type(ret) is not new_type: │ │ 59 │ │ ret = ret.as_subclass(new_type) │ │ 60 │ # Tensor does define setstate even though it doesn't define │ │ │ │ /workspace/venv/lib/python3.11/site-packages/torch/_utils.py:360 in _rebuild_wrapper_subclass │ │ │ │ 357 │ device, │ │ 358 │ requires_grad, │ │ 359 ): │ │ ❱ 360 │ device = _get_restore_location(device) │ │ 361 │ return torch.Tensor._make_wrapper_subclass( # type: ignore[attr-defined] │ │ 362 │ │ cls, │ │ 363 │ │ size, │ │ │ │ /workspace/venv/lib/python3.11/site-packages/torch/_utils.py:127 in _get_restore_location │ │ │ │ 124 │ │ │ return map_location │ │ 125 │ │ else: │ │ 126 │ │ │ assert callable(map_location) │ │ ❱ 127 │ │ │ raise RuntimeError( │ │ 128 │ │ │ │ "Callable map_location not supported with _rebuild_wrapper_subclass " │ │ 129 │ │ │ │ "or _rebuild_device_tensor_from_numpy" │ │ 130 │ │ │ ) │ 
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ RuntimeError: Callable map_location not supported with _rebuild_wrapper_subclass or _rebuild_device_tensor_from_numpy
Trainer(val_check_interval=1.0)
{cls.__name__}.{self.method.__name__}
The text was updated successfully, but these errors were encountered:
With different parameters (precision and optimizer), it works well:
party -d cuda:0 --precision bf16-mixed train -q early --lag 20 --augment --optimizer AdamW --freeze-encoder --warmup 5000 -r 0.0003 -w 0.0001 --schedule constant -B 32 --workers 16 --threads 16 -t dataset/train.lst -e dataset/val.lst -o models/Portuguese --load-from-checkpoint models/Portuguese/checkpoint_05-0.2234.ckpt Using bfloat16 Automatic Mixed Precision (AMP) GPU available: True (cuda), used: True TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs Trainer(val_check_interval=1.0) was configured so validation will run at the end of the training epoch.. Loading from checkpoint models/Portuguese/checkpoint_05-0.2234.ckpt. LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] ┏━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓ ┃ ┃ Name ┃ Type ┃ Params ┃ Mode ┃ ┡━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩ │ 0 │ model │ OptimizedModule │ 214 M │ train │ │ 1 │ model._orig_mod │ PartyModel │ 214 M │ train │ │ 2 │ criterion │ CrossEntropyLoss │ 0 │ train │ │ 3 │ val_mean │ MeanMetric │ 0 │ train │ └───┴─────────────────┴──────────────────┴────────┴───────┘ Trainable params: 154 M Non-trainable params: 59.5 M Total params: 214 M Total estimated model params size (MB): 856 Modules in train mode: 1078 Modules in eval mode: 0 Epoch 0/-2 ━━━━━━━━━24/6253 0:08:22 • 0:49:08 2.11it/s v_num: 4.000 train_loss: 0.120
Sorry, something went wrong.
No branches or pull requests
Trying to load from a checkpoint, using the exact same command with which I trained for just 1 epoch as a test:
party -d cuda:0 --precision bf16-true train -q early --lag 20 --augment --optimizer AdamW8bit --freeze-encoder --warmup 5000 -r 0.0003 -w 0.0001 --schedule constant -B 90 --workers 24 --threads 20 -t dataset/train.lst -e dataset/val.lst -o models/Portuguese --load-from-checkpoint models/Portuguese/checkpoint_00-0.2598.ckpt
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..
Loading from checkpoint models/Portuguese/checkpoint_00-0.2598.ckpt.
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /workspace/venv/bin/party:8 in <module> │
│ │
│ 5 from party.cli import cli │
│ 6 if __name__ == '__main__': │
│ 7 │ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) │
│ ❱ 8 │ sys.exit(cli()) │
│ 9 │
│ │
│ /workspace/venv/lib/python3.11/site-packages/click/core.py:1161 in __call__ │
│ │
│ /workspace/venv/lib/python3.11/site-packages/click/core.py:1082 in main │
│ │
│ /workspace/venv/lib/python3.11/site-packages/click/core.py:1697 in invoke │
│ │
│ /workspace/venv/lib/python3.11/site-packages/click/core.py:1443 in invoke │
│ │
│ /workspace/venv/lib/python3.11/site-packages/click/core.py:788 in invoke │
│ │
│ /workspace/venv/lib/python3.11/site-packages/click/decorators.py:33 in new_func │
│ │
│ /workspace/party/party/cli/train.py:348 in train │
│ │
│ 345 │ with trainer.init_module(): │
│ 346 │ │ if load_from_checkpoint: │
│ 347 │ │ │ message(f'Loading from checkpoint {load_from_checkpoint}.') │
│ ❱ 348 │ │ │ model = RecognitionModel.load_from_checkpoint(load_from_checkpoint, │
│ 349 │ │ │ │ │ │ │ │ │ │ │ │ │ │ **hyper_params) │
│ 350 │ │ elif load_from_repo: │
│ 351 │ │ │ message(f'Loading from huggingface hub {load_from_repo}.') │
│ │
│ /workspace/venv/lib/python3.11/site-packages/lightning/pytorch/utilities/model_helpers.py:125 in │
│ wrapper │
│ │
│ 122 │ │ │ │ │ f"The classmethod `{cls.__name__}.{self.method.__name__}` cannot be │
│ 123 │ │ │ │ │ " Please call it on the class type and make sure the return value is │
│ 124 │ │ │ │ ) │
│ ❱ 125 │ │ │ return self.method(cls, *args, **kwargs) │
│ 126 │ │ │
│ 127 │ │ return wrapper │
│ 128 │
│ │
│ /workspace/venv/lib/python3.11/site-packages/lightning/pytorch/core/module.py:1582 in │
│ load_from_checkpoint │
│ │
│ 1579 │ │ │ y_hat = pretrained_model(x) │
│ 1580 │ │ │
│ 1581 │ │ """ │
│ ❱ 1582 │ │ loaded = _load_from_checkpoint( │
│ 1583 │ │ │ cls, │
│ 1584 │ │ │ checkpoint_path, │
│ 1585 │ │ │ map_location, │
│ │
│ /workspace/venv/lib/python3.11/site-packages/lightning/pytorch/core/saving.py:63 in │
│ _load_from_checkpoint │
│ │
│ 60 ) -> Union["pl.LightningModule", "pl.LightningDataModule"]: │
│ 61 │ map_location = map_location or _default_map_location │
│ 62 │ with pl_legacy_patch(): │
│ ❱ 63 │ │ checkpoint = pl_load(checkpoint_path, map_location=map_location) │
│ 64 │ │
│ 65 │ # convert legacy checkpoints to the new format │
│ 66 │ checkpoint = _pl_migrate_checkpoint( │
│ │
│ /workspace/venv/lib/python3.11/site-packages/lightning/fabric/utilities/cloud_io.py:60 in _load │
│ │
│ 57 │ │ ) │
│ 58 │ fs = get_filesystem(path_or_url) │
│ 59 │ with fs.open(path_or_url, "rb") as f: │
│ ❱ 60 │ │ return torch.load( │
│ 61 │ │ │ f, │
│ 62 │ │ │ map_location=map_location, # type: ignore[arg-type] │
│ 63 │ │ │ weights_only=weights_only, │
│ │
│ /workspace/venv/lib/python3.11/site-packages/torch/serialization.py:1360 in load │
│ │
│ 1357 │ │ │ │ │ │ ) │
│ 1358 │ │ │ │ │ except pickle.UnpicklingError as e: │
│ 1359 │ │ │ │ │ │ raise pickle.UnpicklingError(_get_wo_message(str(e))) from None │
│ ❱ 1360 │ │ │ │ return _load( │
│ 1361 │ │ │ │ │ opened_zipfile, │
│ 1362 │ │ │ │ │ map_location, │
│ 1363 │ │ │ │ │ pickle_module, │
│ │
│ /workspace/venv/lib/python3.11/site-packages/torch/serialization.py:1848 in _load │
│ │
│ 1845 │ # not connected (wrapper subclasses and tensors rebuilt using numpy) │
│ 1846 │ global _serialization_tls │
│ 1847 │ _serialization_tls.map_location = map_location │
│ ❱ 1848 │ result = unpickler.load() │
│ 1849 │ _serialization_tls.map_location = None │
│ 1850 │ │
│ 1851 │ torch._utils._validate_loaded_sparse_tensors() │
│ │
│ /usr/lib/python3.11/pickle.py:1213 in load │
│ │
│ 1210 │ │ │ │ if not key: │
│ 1211 │ │ │ │ │ raise EOFError │
│ 1212 │ │ │ │ assert isinstance(key, bytes_types) │
│ ❱ 1213 │ │ │ │ dispatch[key[0]](self) │
│ 1214 │ │ except _Stop as stopinst: │
│ 1215 │ │ │ return stopinst.value │
│ 1216 │
│ │
│ /usr/lib/python3.11/pickle.py:1590 in load_reduce │
│ │
│ 1587 │ │ stack = self.stack │
│ 1588 │ │ args = stack.pop() │
│ 1589 │ │ func = stack[-1] │
│ ❱ 1590 │ │ stack[-1] = func(*args) │
│ 1591 │ dispatch[REDUCE[0]] = load_reduce │
│ 1592 │ │
│ 1593 │ def load_pop(self): │
│ │
│ /workspace/venv/lib/python3.11/site-packages/torch/_tensor.py:57 in _rebuild_from_type_v2 │
│ │
│ 54 │
│ 55 │
│ 56 def _rebuild_from_type_v2(func, new_type, args, state): │
│ ❱ 57 │ ret = func(*args) │
│ 58 │ if type(ret) is not new_type: │
│ 59 │ │ ret = ret.as_subclass(new_type) │
│ 60 │ # Tensor does define __setstate__ even though it doesn't define │
│ │
│ /workspace/venv/lib/python3.11/site-packages/torch/_utils.py:360 in _rebuild_wrapper_subclass │
│ │
│ 357 │ device, │
│ 358 │ requires_grad, │
│ 359 ): │
│ ❱ 360 │ device = _get_restore_location(device) │
│ 361 │ return torch.Tensor._make_wrapper_subclass( # type: ignore[attr-defined] │
│ 362 │ │ cls, │
│ 363 │ │ size, │
│ │
│ /workspace/venv/lib/python3.11/site-packages/torch/_utils.py:127 in _get_restore_location │
│ │
│ 124 │ │ │ return map_location │
│ 125 │ │ else: │
│ 126 │ │ │ assert callable(map_location) │
│ ❱ 127 │ │ │ raise RuntimeError( │
│ 128 │ │ │ │ "Callable map_location not supported with _rebuild_wrapper_subclass " │
│ 129 │ │ │ │ "or _rebuild_device_tensor_from_numpy" │
│ 130 │ │ │ ) │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Callable map_location not supported with _rebuild_wrapper_subclass or _rebuild_device_tensor_from_numpy
The text was updated successfully, but these errors were encountered: