Training stops when it finds a file with no text #741

@jordanxlau

Bug description

Training stops/exits when it finds a file with no text. For example, when training on phonemes, the filelist entry for LJ045-0058 looks like this:

LJ045-0058|eng|speaker_0|1963.|.

It has no usable 'text': G2P produced no phonemes for the raw text `1963.`, leaving only the period in the phones column.

This is an issue with other files in LJ Speech too; I haven't compiled a full list.
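
As a stopgap, I scan the filelist for such rows before training. A rough sketch below; the five-column layout (basename|language|speaker|text|phones) and the filelist path are my guesses from the line above, not the project's actual schema:

```python
# Sketch: flag filelist rows whose phones column is empty or punctuation-only.
# The 5-column layout is an assumption based on the LJ045-0058 line above.
import csv
import unicodedata


def no_phones(field: str) -> bool:
    # True for an empty field or one made only of punctuation/whitespace,
    # like the lone "." in the LJ045-0058 row.
    return all(unicodedata.category(c)[0] in ("P", "Z") for c in field)


def find_bad_rows(filelist_path: str) -> list[str]:
    with open(filelist_path, encoding="utf-8") as f:
        return [row[0] for row in csv.reader(f, delimiter="|")
                if len(row) >= 5 and no_phones(row[4])]


if __name__ == "__main__":
    for basename in find_bad_rows("preprocessed/filelist.psv"):
        print(f"no phones: {basename}")  # e.g. LJ045-0058
```

Any basenames it prints could be dropped from the filelist (or fixed upstream) so training doesn't abort partway through.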

How to reproduce the bug

If you create an EveryVoice project based on phonemized LJ Speech and begin training (make sure to set `target_text_representation_level: phones`), you'll see the error I describe.

Error messages and logs

[rank0]: ╭───────────────────── Traceback (most recent call last) ──────────────────────╮
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/everyvoice/model/feature_prediction/Fast │
[rank0]: │ Speech2_lightning/fs2/variance_adaptor.py:293 in forward │
[rank0]: │ │
[rank0]: │ 290 │ │ │ │ equal_dur_targets = torch.eq( │
[rank0]: │ 291 │ │ │ │ │ duration_target.sum(dim=1), batch["mel_lens"] │
[rank0]: │ 292 │ │ │ │ ) │
[rank0]: │ ❱ 293 │ │ │ │ assert torch.all(equal_dur_targets) │
[rank0]: │ 294 │ │ │ except AssertionError as e: │
[rank0]: │ 295 │ │ │ │ from itertools import compress │
[rank0]: │ 296 │
[rank0]: ╰──────────────────────────────────────────────────────────────────────────────╯
[rank0]: AssertionError

[rank0]: The above exception was the direct cause of the following exception:

[rank0]: ╭───────────────────── Traceback (most recent call last) ──────────────────────╮
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/everyvoice/model/feature_prediction/Fast │
[rank0]: │ Speech2_lightning/fs2/cli/train.py:33 in train │
[rank0]: │ │
[rank0]: │ 30 │ │
[rank0]: │ 31 │ model_kwargs = {"lang2id": lang2id, "speaker2id": speaker2id, "stat │
[rank0]: │ 32 │ │
[rank0]: │ ❱ 33 │ train_base_command( │
[rank0]: │ 34 │ │ model_config=FastSpeech2Config, │
[rank0]: │ 35 │ │ model=FastSpeech2, │
[rank0]: │ 36 │ │ data_module=FastSpeech2DataModule, │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/everyvoice/base_cli/helpers.py:278 in │
[rank0]: │ train_base_command │
[rank0]: │ │
[rank0]: │ 275 │ │ model_obj = model(config, **model_kwargs) │
[rank0]: │ 276 │ │ logger.info(f"Model's architecture\n{model_obj}") │
[rank0]: │ 277 │ │ tensorboard_logger.log_hyperparams(config.model_dump()) │
[rank0]: │ ❱ 278 │ │ trainer.fit(model_obj, data) │
[rank0]: │ 279 │ else: │
[rank0]: │ 280 │ │ try: │
[rank0]: │ 281 │ │ │ model_obj = model.load_from_checkpoint(last_ckpt) │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/trainer/trainer.py:561 in fit │
[rank0]: │ │
[rank0]: │ 558 │ │ self.state.status = TrainerStatus.RUNNING │
[rank0]: │ 559 │ │ self.training = True │
[rank0]: │ 560 │ │ self.should_stop = False │
[rank0]: │ ❱ 561 │ │ call._call_and_handle_interrupt( │
[rank0]: │ 562 │ │ │ self, self._fit_impl, model, train_dataloaders, val_datal │
[rank0]: │ 563 │ │ ) │
[rank0]: │ 564 │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/trainer/call.py:47 in _call_and_handle_interrupt │
[rank0]: │ │
[rank0]: │ 44 │ """ │
[rank0]: │ 45 │ try: │
[rank0]: │ 46 │ │ if trainer.strategy.launcher is not None: │
[rank0]: │ ❱ 47 │ │ │ return trainer.strategy.launcher.launch(trainer_fn, *args, │
[rank0]: │ 48 │ │ return trainer_fn(*args, **kwargs) │
[rank0]: │ 49 │ │
[rank0]: │ 50 │ except _TunerExitException: │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/strategies/launchers/subprocess_script.py:105 in launch │
[rank0]: │ │
[rank0]: │ 102 │ │ │ _launch_process_observer(self.procs) │
[rank0]: │ 103 │ │ │
[rank0]: │ 104 │ │ _set_num_threads_if_needed(num_processes=self.num_processes) │
[rank0]: │ ❱ 105 │ │ return function(*args, **kwargs) │
[rank0]: │ 106 │ │
[rank0]: │ 107 │ @override │
[rank0]: │ 108 │ def kill(self, signum: _SIGNUM) -> None: │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/trainer/trainer.py:599 in _fit_impl │
[rank0]: │ │
[rank0]: │ 596 │ │ │ model_provided=True, │
[rank0]: │ 597 │ │ │ model_connected=self.lightning_module is not None, │
[rank0]: │ 598 │ │ ) │
[rank0]: │ ❱ 599 │ │ self._run(model, ckpt_path=ckpt_path) │
[rank0]: │ 600 │ │ │
[rank0]: │ 601 │ │ assert self.state.stopped │
[rank0]: │ 602 │ │ self.training = False │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/trainer/trainer.py:1012 in _run │
[rank0]: │ │
[rank0]: │ 1009 │ │ # ---------------------------- │
[rank0]: │ 1010 │ │ # RUN THE TRAINER │
[rank0]: │ 1011 │ │ # ---------------------------- │
[rank0]: │ ❱ 1012 │ │ results = self._run_stage() │
[rank0]: │ 1013 │ │ │
[rank0]: │ 1014 │ │ # ---------------------------- │
[rank0]: │ 1015 │ │ # POST-Training CLEAN UP │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/trainer/trainer.py:1056 in _run_stage │
[rank0]: │ │
[rank0]: │ 1053 │ │ │ with isolate_rng(): │
[rank0]: │ 1054 │ │ │ │ self._run_sanity_check() │
[rank0]: │ 1055 │ │ │ with torch.autograd.set_detect_anomaly(self._detect_anoma │
[rank0]: │ ❱ 1056 │ │ │ │ self.fit_loop.run() │
[rank0]: │ 1057 │ │ │ return None │
[rank0]: │ 1058 │ │ raise RuntimeError(f"Unexpected state {self.state}") │
[rank0]: │ 1059 │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/loops/fit_loop.py:216 in run │
[rank0]: │ │
[rank0]: │ 213 │ │ while not self.done: │
[rank0]: │ 214 │ │ │ try: │
[rank0]: │ 215 │ │ │ │ self.on_advance_start() │
[rank0]: │ ❱ 216 │ │ │ │ self.advance() │
[rank0]: │ 217 │ │ │ │ self.on_advance_end() │
[rank0]: │ 218 │ │ │ except StopIteration: │
[rank0]: │ 219 │ │ │ │ break │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/loops/fit_loop.py:455 in advance │
[rank0]: │ │
[rank0]: │ 452 │ │ │ ) │
[rank0]: │ 453 │ │ with self.trainer.profiler.profile("run_training_epoch"): │
[rank0]: │ 454 │ │ │ assert self._data_fetcher is not None │
[rank0]: │ ❱ 455 │ │ │ self.epoch_loop.run(self._data_fetcher) │
[rank0]: │ 456 │ │
[rank0]: │ 457 │ def on_advance_end(self) -> None: │
[rank0]: │ 458 │ │ trainer = self.trainer │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/loops/training_epoch_loop.py:152 in run │
[rank0]: │ │
[rank0]: │ 149 │ │ self.on_run_start(data_fetcher) │
[rank0]: │ 150 │ │ while not self.done: │
[rank0]: │ 151 │ │ │ try: │
[rank0]: │ ❱ 152 │ │ │ │ self.advance(data_fetcher) │
[rank0]: │ 153 │ │ │ │ self.on_advance_end(data_fetcher) │
[rank0]: │ 154 │ │ │ except StopIteration: │
[rank0]: │ 155 │ │ │ │ break │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/loops/training_epoch_loop.py:344 in advance │
[rank0]: │ │
[rank0]: │ 341 │ │ │ with trainer.profiler.profile("run_training_batch"): │
[rank0]: │ 342 │ │ │ │ if trainer.lightning_module.automatic_optimization: │
[rank0]: │ 343 │ │ │ │ │ # in automatic optimization, there can only be one │
[rank0]: │ ❱ 344 │ │ │ │ │ batch_output = self.automatic_optimization.run(tra │
[rank0]: │ 345 │ │ │ │ else: │
[rank0]: │ 346 │ │ │ │ │ batch_output = self.manual_optimization.run(kwargs │
[rank0]: │ 347 │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/loops/optimization/automatic.py:192 in run │
[rank0]: │ │
[rank0]: │ 189 │ │ # ------------------------------ │
[rank0]: │ 190 │ │ # gradient update with accumulated gradients │
[rank0]: │ 191 │ │ else: │
[rank0]: │ ❱ 192 │ │ │ self._optimizer_step(batch_idx, closure) │
[rank0]: │ 193 │ │ │
[rank0]: │ 194 │ │ result = closure.consume_result() │
[rank0]: │ 195 │ │ if result.loss is None: │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/loops/optimization/automatic.py:270 in _optimizer_step │
[rank0]: │ │
[rank0]: │ 267 │ │ │ self.optim_progress.optimizer.step.increment_ready() │
[rank0]: │ 268 │ │ │
[rank0]: │ 269 │ │ # model hook │
[rank0]: │ ❱ 270 │ │ call._call_lightning_module_hook( │
[rank0]: │ 271 │ │ │ trainer, │
[rank0]: │ 272 │ │ │ "optimizer_step", │
[rank0]: │ 273 │ │ │ trainer.current_epoch, │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/trainer/call.py:176 in _call_lightning_module_hook │
[rank0]: │ │
[rank0]: │ 173 │ pl_module._current_fx_name = hook_name │
[rank0]: │ 174 │ │
[rank0]: │ 175 │ with trainer.profiler.profile(f"[LightningModule]{pl_module.__clas │
[rank0]: │ ❱ 176 │ │ output = fn(*args, **kwargs) │
[rank0]: │ 177 │ │
[rank0]: │ 178 │ # restore current_fx when nested context │
[rank0]: │ 179 │ pl_module._current_fx_name = prev_fx_name │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/core/module.py:1328 in optimizer_step │
[rank0]: │ │
[rank0]: │ 1325 │ │ │ │ # Add your custom logic to run directly after optimi │
[rank0]: │ 1326 │ │ │
[rank0]: │ 1327 │ │ """ │
[rank0]: │ ❱ 1328 │ │ optimizer.step(closure=optimizer_closure) │
[rank0]: │ 1329 │ │
[rank0]: │ 1330 │ def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimiz │
[rank0]: │ 1331 │ │ """Override this method to change the default behaviour of `` │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/core/optimizer.py:154 in step │
[rank0]: │ │
[rank0]: │ 151 │ │ │ raise MisconfigurationException("When optimizer.step(clos │
[rank0]: │ 152 │ │ │
[rank0]: │ 153 │ │ assert self._strategy is not None │
[rank0]: │ ❱ 154 │ │ step_output = self._strategy.optimizer_step(self._optimizer, c │
[rank0]: │ 155 │ │ │
[rank0]: │ 156 │ │ self._on_after_step() │
[rank0]: │ 157 │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/strategies/ddp.py:270 in optimizer_step │
[rank0]: │ │
[rank0]: │ 267 │ │ │ **kwargs: Any extra arguments to ``optimizer.step`` │
[rank0]: │ 268 │ │ │
[rank0]: │ 269 │ │ """ │
[rank0]: │ ❱ 270 │ │ optimizer_output = super().optimizer_step(optimizer, closure, │
[rank0]: │ 271 │ │ │
[rank0]: │ 272 │ │ if self._model_averager is None: │
[rank0]: │ 273 │ │ │ return optimizer_output │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/strategies/strategy.py:239 in optimizer_step │
[rank0]: │ │
[rank0]: │ 236 │ │ model = model or self.lightning_module │
[rank0]: │ 237 │ │ # TODO(fabric): remove assertion once strategy's optimizer_ste │
[rank0]: │ 238 │ │ assert isinstance(model, pl.LightningModule) │
[rank0]: │ ❱ 239 │ │ return self.precision_plugin.optimizer_step(optimizer, model=m │
[rank0]: │ 240 │ │
[rank0]: │ 241 │ def _setup_model_and_optimizers(self, model: Module, optimizers: l │
[rank0]: │ 242 │ │ """Setup a model and multiple optimizers together. │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/plugins/precision/precision.py:123 in optimizer_step │
[rank0]: │ │
[rank0]: │ 120 │ ) -> Any: │
[rank0]: │ 121 │ │ """Hook to run the optimizer step.""" │
[rank0]: │ 122 │ │ closure = partial(self._wrap_closure, model, optimizer, closur │
[rank0]: │ ❱ 123 │ │ return optimizer.step(closure=closure, **kwargs) │
[rank0]: │ 124 │ │
[rank0]: │ 125 │ def _clip_gradients( │
[rank0]: │ 126 │ │ self, │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/torch │
[rank0]: │ /optim/lr_scheduler.py:124 in wrapper │
[rank0]: │ │
[rank0]: │ 121 │ │ │ │ def wrapper(*args, **kwargs): │
[rank0]: │ 122 │ │ │ │ │ opt = opt_ref() │
[rank0]: │ 123 │ │ │ │ │ opt._opt_called = True # type: ignore[union-attr │
[rank0]: │ ❱ 124 │ │ │ │ │ return func.__get__(opt, opt.__class__)(*args, ** │
[rank0]: │ 125 │ │ │ │ │
[rank0]: │ 126 │ │ │ │ wrapper._wrapped_by_lr_sched = True # type: ignore[a │
[rank0]: │ 127 │ │ │ │ return wrapper │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/torch │
[rank0]: │ /optim/optimizer.py:485 in wrapper │
[rank0]: │ │
[rank0]: │ 482 │ │ │ │ │ │ │ │ f"{func} must return None or a tuple │
[rank0]: │ 483 │ │ │ │ │ │ │ ) │
[rank0]: │ 484 │ │ │ │ │
[rank0]: │ ❱ 485 │ │ │ │ out = func(*args, **kwargs) │
[rank0]: │ 486 │ │ │ │ self._optimizer_step_code() │
[rank0]: │ 487 │ │ │ │ │
[rank0]: │ 488 │ │ │ │ # call optimizer step post hooks │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/torch │
[rank0]: │ /optim/optimizer.py:79 in _use_grad │
[rank0]: │ │
[rank0]: │ 76 │ │ │ # see https://github.com/pytorch/pytorch/issues/104053 │
[rank0]: │ 77 │ │ │ torch.set_grad_enabled(self.defaults["differentiable"]) │
[rank0]: │ 78 │ │ │ torch._dynamo.graph_break() │
[rank0]: │ ❱ 79 │ │ │ ret = func(self, *args, **kwargs) │
[rank0]: │ 80 │ │ finally: │
[rank0]: │ 81 │ │ │ torch._dynamo.graph_break() │
[rank0]: │ 82 │ │ │ torch.set_grad_enabled(prev_grad) │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/torch │
[rank0]: │ /optim/adam.py:225 in step │
[rank0]: │ │
[rank0]: │ 222 │ │ loss = None │
[rank0]: │ 223 │ │ if closure is not None: │
[rank0]: │ 224 │ │ │ with torch.enable_grad(): │
[rank0]: │ ❱ 225 │ │ │ │ loss = closure() │
[rank0]: │ 226 │ │ │
[rank0]: │ 227 │ │ for group in self.param_groups: │
[rank0]: │ 228 │ │ │ params_with_grad: list[Tensor] = [] │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/plugins/precision/precision.py:109 in _wrap_closure │
[rank0]: │ │
[rank0]: │ 106 │ │ consistent with the Precision subclasses that cannot pass │
[rank0]: │ 107 │ │ │
[rank0]: │ 108 │ │ """ │
[rank0]: │ ❱ 109 │ │ closure_result = closure() │
[rank0]: │ 110 │ │ self._after_closure(model, optimizer) │
[rank0]: │ 111 │ │ return closure_result │
[rank0]: │ 112 │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/loops/optimization/automatic.py:146 in __call__ │
[rank0]: │ │
[rank0]: │ 143 │ │
[rank0]: │ 144 │ @override │
[rank0]: │ 145 │ def __call__(self, *args: Any, **kwargs: Any) -> Optional[Tensor]: │
[rank0]: │ ❱ 146 │ │ self._result = self.closure(*args, **kwargs) │
[rank0]: │ 147 │ │ return self._result.loss │
[rank0]: │ 148 │
[rank0]: │ 149 │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/torch │
[rank0]: │ /utils/_contextlib.py:116 in decorate_context │
[rank0]: │ │
[rank0]: │ 113 │ @functools.wraps(func) │
[rank0]: │ 114 │ def decorate_context(*args, **kwargs): │
[rank0]: │ 115 │ │ with ctx_factory(): │
[rank0]: │ ❱ 116 │ │ │ return func(*args, **kwargs) │
[rank0]: │ 117 │ │
[rank0]: │ 118 │ return decorate_context │
[rank0]: │ 119 │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/loops/optimization/automatic.py:131 in closure │
[rank0]: │ │
[rank0]: │ 128 │ @override │
[rank0]: │ 129 │ @torch.enable_grad() │
[rank0]: │ 130 │ def closure(self, *args: Any, **kwargs: Any) -> ClosureResult: │
[rank0]: │ ❱ 131 │ │ step_output = self._step_fn() │
[rank0]: │ 132 │ │ │
[rank0]: │ 133 │ │ if step_output.closure_loss is None: │
[rank0]: │ 134 │ │ │ self.warning_cache.warn("training_step returned None. │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/loops/optimization/automatic.py:319 in _training_step │
[rank0]: │ │
[rank0]: │ 316 │ │ """ │
[rank0]: │ 317 │ │ trainer = self.trainer │
[rank0]: │ 318 │ │ │
[rank0]: │ ❱ 319 │ │ training_step_output = call._call_strategy_hook(trainer, "trai │
[rank0]: │ 320 │ │ self.trainer.strategy.post_training_step() # unused hook - ca │
[rank0]: │ 321 │ │ │
[rank0]: │ 322 │ │ if training_step_output is None and trainer.world_size > 1: │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/trainer/call.py:328 in _call_strategy_hook │
[rank0]: │ │
[rank0]: │ 325 │ │ return None │
[rank0]: │ 326 │ │
[rank0]: │ 327 │ with trainer.profiler.profile(f"[Strategy]{trainer.strategy.__clas │
[rank0]: │ ❱ 328 │ │ output = fn(*args, **kwargs) │
[rank0]: │ 329 │ │
[rank0]: │ 330 │ # restore current_fx when nested context │
[rank0]: │ 331 │ pl_module._current_fx_name = prev_fx_name │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/strategies/strategy.py:390 in training_step │
[rank0]: │ │
[rank0]: │ 387 │ │ assert self.model is not None │
[rank0]: │ 388 │ │ with self.precision_plugin.train_step_context(): │
[rank0]: │ 389 │ │ │ if self.model != self.lightning_module: │
[rank0]: │ ❱ 390 │ │ │ │ return self._forward_redirection(self.model, self.ligh │
[rank0]: │ 391 │ │ │ return self.lightning_module.training_step(*args, **kwargs │
[rank0]: │ 392 │ │
[rank0]: │ 393 │ def post_training_step(self) -> None: │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/strategies/strategy.py:641 in __call__ │
[rank0]: │ │
[rank0]: │ 638 │ │ # Patch the original_module's forward so we can redirect the a │
[rank0]: │ 639 │ │ original_module.forward = wrapped_forward # type: ignore[meth │
[rank0]: │ 640 │ │ │
[rank0]: │ ❱ 641 │ │ wrapper_output = wrapper_module(*args, **kwargs) │
[rank0]: │ 642 │ │ self.on_after_outer_forward(wrapper_module, original_module) │
[rank0]: │ 643 │ │ return wrapper_output │
[rank0]: │ 644 │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/torch │
[rank0]: │ /nn/modules/module.py:1751 in _wrapped_call_impl │
[rank0]: │ │
[rank0]: │ 1748 │ │ if self._compiled_call_impl is not None: │
[rank0]: │ 1749 │ │ │ return self._compiled_call_impl(*args, **kwargs) # type: │
[rank0]: │ 1750 │ │ else: │
[rank0]: │ ❱ 1751 │ │ │ return self._call_impl(*args, **kwargs) │
[rank0]: │ 1752 │ │
[rank0]: │ 1753 │ # torchrec tests the code consistency with the following code │
[rank0]: │ 1754 │ # fmt: off │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/torch │
[rank0]: │ /nn/modules/module.py:1762 in _call_impl │
[rank0]: │ │
[rank0]: │ 1759 │ │ if not (self._backward_hooks or self._backward_pre_hooks or s │
[rank0]: │ 1760 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hoo │
[rank0]: │ 1761 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
[rank0]: │ ❱ 1762 │ │ │ return forward_call(*args, **kwargs) │
[rank0]: │ 1763 │ │ │
[rank0]: │ 1764 │ │ result = None │
[rank0]: │ 1765 │ │ called_always_called_hooks = set() │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/torch │
[rank0]: │ /nn/parallel/distributed.py:1637 in forward │
[rank0]: │ │
[rank0]: │ 1634 │ │ │ output = ( │
[rank0]: │ 1635 │ │ │ │ self.module.forward(*inputs, **kwargs) │
[rank0]: │ 1636 │ │ │ │ if self._delay_all_reduce_all_params │
[rank0]: │ ❱ 1637 │ │ │ │ else self._run_ddp_forward(*inputs, **kwargs) │
[rank0]: │ 1638 │ │ │ ) │
[rank0]: │ 1639 │ │ │ return self._post_forward(output) │
[rank0]: │ 1640 │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/torch │
[rank0]: │ /nn/parallel/distributed.py:1464 in _run_ddp_forward │
[rank0]: │ │
[rank0]: │ 1461 │ │ │ return self.module(*inputs, **kwargs) # type: ignore[ind │
[rank0]: │ 1462 │ │ else: │
[rank0]: │ 1463 │ │ │ with self._inside_ddp_forward(): │
[rank0]: │ ❱ 1464 │ │ │ │ return self.module(*inputs, **kwargs) # type: ignore │
[rank0]: │ 1465 │ │
[rank0]: │ 1466 │ def _clear_grad_buffer(self): │
[rank0]: │ 1467 │ │ # Making param.grad points to the grad buffers before backwar │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/torch │
[rank0]: │ /nn/modules/module.py:1751 in _wrapped_call_impl │
[rank0]: │ │
[rank0]: │ 1748 │ │ if self._compiled_call_impl is not None: │
[rank0]: │ 1749 │ │ │ return self._compiled_call_impl(*args, **kwargs) # type: │
[rank0]: │ 1750 │ │ else: │
[rank0]: │ ❱ 1751 │ │ │ return self._call_impl(*args, **kwargs) │
[rank0]: │ 1752 │ │
[rank0]: │ 1753 │ # torchrec tests the code consistency with the following code │
[rank0]: │ 1754 │ # fmt: off │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/torch │
[rank0]: │ /nn/modules/module.py:1762 in _call_impl │
[rank0]: │ │
[rank0]: │ 1759 │ │ if not (self._backward_hooks or self._backward_pre_hooks or s │
[rank0]: │ 1760 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hoo │
[rank0]: │ 1761 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
[rank0]: │ ❱ 1762 │ │ │ return forward_call(*args, **kwargs) │
[rank0]: │ 1763 │ │ │
[rank0]: │ 1764 │ │ result = None │
[rank0]: │ 1765 │ │ called_always_called_hooks = set() │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/pytor │
[rank0]: │ ch_lightning/strategies/strategy.py:634 in wrapped_forward │
[rank0]: │ │
[rank0]: │ 631 │ │ │ original_module.forward = original_forward # type: ignore │
[rank0]: │ 632 │ │ │ # Call the actual method e.g. `.training_step(...)` │
[rank0]: │ 633 │ │ │ method = getattr(original_module, method_name) │
[rank0]: │ ❱ 634 │ │ │ out = method(*_args, **_kwargs) │
[rank0]: │ 635 │ │ │ self.on_after_inner_forward(wrapper_module, original_modul │
[rank0]: │ 636 │ │ │ return out │
[rank0]: │ 637 │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/everyvoice/model/feature_prediction/Fast │
[rank0]: │ Speech2_lightning/fs2/model.py:387 in training_step │
[rank0]: │ │
[rank0]: │ 384 │ │ │ return self(batch, inference=True) │
[rank0]: │ 385 │ │
[rank0]: │ 386 │ def training_step(self, batch, batch_idx): │
[rank0]: │ ❱ 387 │ │ output = self(batch) │
[rank0]: │ 388 │ │ losses = self.loss(output, batch, self.current_epoch) │
[rank0]: │ 389 │ │ self.log_dict( │
[rank0]: │ 390 │ │ │ {f"training/{k}_loss": v.item() for k, v in losses.items() │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/torch │
[rank0]: │ /nn/modules/module.py:1751 in _wrapped_call_impl │
[rank0]: │ │
[rank0]: │ 1748 │ │ if self._compiled_call_impl is not None: │
[rank0]: │ 1749 │ │ │ return self._compiled_call_impl(*args, **kwargs) # type: │
[rank0]: │ 1750 │ │ else: │
[rank0]: │ ❱ 1751 │ │ │ return self._call_impl(*args, **kwargs) │
[rank0]: │ 1752 │ │
[rank0]: │ 1753 │ # torchrec tests the code consistency with the following code │
[rank0]: │ 1754 │ # fmt: off │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/torch │
[rank0]: │ /nn/modules/module.py:1762 in _call_impl │
[rank0]: │ │
[rank0]: │ 1759 │ │ if not (self._backward_hooks or self._backward_pre_hooks or s │
[rank0]: │ 1760 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hoo │
[rank0]: │ 1761 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
[rank0]: │ ❱ 1762 │ │ │ return forward_call(*args, **kwargs) │
[rank0]: │ 1763 │ │ │
[rank0]: │ 1764 │ │ result = None │
[rank0]: │ 1765 │ │ called_always_called_hooks = set() │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/everyvoice/model/feature_prediction/Fast │
[rank0]: │ Speech2_lightning/fs2/model.py:218 in forward │
[rank0]: │ │
[rank0]: │ 215 │ │ │ x = x + lang_emb.unsqueeze(1) │
[rank0]: │ 216 │ │ │
[rank0]: │ 217 │ │ # VarianceAdaptor out │
[rank0]: │ ❱ 218 │ │ variance_adaptor_out = self.variance_adaptor( │
[rank0]: │ 219 │ │ │ inputs, │
[rank0]: │ 220 │ │ │ x, │
[rank0]: │ 221 │ │ │ batch, │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/torch │
[rank0]: │ /nn/modules/module.py:1751 in _wrapped_call_impl │
[rank0]: │ │
[rank0]: │ 1748 │ │ if self._compiled_call_impl is not None: │
[rank0]: │ 1749 │ │ │ return self._compiled_call_impl(*args, **kwargs) # type: │
[rank0]: │ 1750 │ │ else: │
[rank0]: │ ❱ 1751 │ │ │ return self._call_impl(*args, **kwargs) │
[rank0]: │ 1752 │ │
[rank0]: │ 1753 │ # torchrec tests the code consistency with the following code │
[rank0]: │ 1754 │ # fmt: off │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/.venv/lib/python3.12/site-packages/torch │
[rank0]: │ /nn/modules/module.py:1762 in _call_impl │
[rank0]: │ │
[rank0]: │ 1759 │ │ if not (self._backward_hooks or self._backward_pre_hooks or s │
[rank0]: │ 1760 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hoo │
[rank0]: │ 1761 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks │
[rank0]: │ ❱ 1762 │ │ │ return forward_call(*args, **kwargs) │
[rank0]: │ 1763 │ │ │
[rank0]: │ 1764 │ │ result = None │
[rank0]: │ 1765 │ │ called_always_called_hooks = set() │
[rank0]: │ │
[rank0]: │ /gpfs/fs3c/nrc/dt/jol004/EveryVoice/everyvoice/model/feature_prediction/Fast │
[rank0]: │ Speech2_lightning/fs2/variance_adaptor.py:302 in forward │
[rank0]: │ │
[rank0]: │ 299 │ │ │ │ │ │ batch["basename"], [not x for x in equal_dur_t │
[rank0]: │ 300 │ │ │ │ │ ) │
[rank0]: │ 301 │ │ │ │ ) │
[rank0]: │ ❱ 302 │ │ │ │ raise BadDataError( │
[rank0]: │ 303 │ │ │ │ │ f"Something failed with the following items, pleas │
[rank0]: │ 304 │ │ │ │ ) from e │
[rank0]: │ 305 │ │ │ │ sys.exit(1) │
[rank0]: ╰──────────────────────────────────────────────────────────────────────────────╯
[rank0]: BadDataError: Something failed with the following items, please check them for
[rank0]: errors: ['LJ045-0058']
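
For context, the check that fires (variance_adaptor.py lines 290-293 above) compares each item's summed per-phone duration targets against the stored mel lengths, and the except branch uses itertools.compress over batch["basename"] to name the offending items. A toy reproduction with made-up numbers, where the second item stands in for LJ045-0058's empty phone sequence:

```python
# Toy illustration of the failing check; all tensor values are invented.
import torch

duration_target = torch.tensor([[3, 2, 4],   # item 0: durations sum to 9
                                [0, 0, 0]])  # item 1: no phones, no durations
mel_lens = torch.tensor([9, 42])  # frames extracted from the audio

equal_dur_targets = torch.eq(duration_target.sum(dim=1), mel_lens)
print(equal_dur_targets)  # tensor([ True, False])

# The assertion that raises above, later converted into the BadDataError
# naming 'LJ045-0058'.
assert torch.all(equal_dur_targets)
```

It would be nicer if preprocessing dropped (or at least warned about) zero-phone entries instead of training dying mid-epoch.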

Environment

Current environment
# Please paste the output of `everyvoice --diagnostic` here
# EveryVoice Diagnostic information

More info

No response

Labels

bug (Something isn't working)