-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Closed
Labels
bug (Something isn't working), good first issue (Good for newcomers)
Description
开始训练时报错:
Traceback (most recent call last):
File "/root/autodl-tmp/WeClone/.venv/bin/weclone-cli", line 10, in <module>
sys.exit(cli())
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/click/core.py", line 1442, in __call__
return self.main(*args, **kwargs)
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/click/core.py", line 1363, in main
rv = self.invoke(ctx)
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/click/core.py", line 1830, in invoke
return _process_result(sub_ctx.command.invoke(sub_ctx))
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/click/core.py", line 1226, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/click/core.py", line 794, in invoke
return callback(*args, **kwargs)
File "/root/autodl-tmp/WeClone/weclone/cli.py", line 36, in wrapper
return func(*args, **kwargs)
File "/root/autodl-tmp/WeClone/weclone/cli.py", line 67, in new_runtime_wrapper
return original_cmd_func(*args, **kwargs)
File "/root/autodl-tmp/WeClone/weclone/cli.py", line 120, in train_sft
train_sft_main()
File "/root/autodl-tmp/WeClone/weclone/train/train_sft.py", line 43, in main
run_exp(train_config.model_dump(mode="json"))
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/llamafactory/train/tuner.py", line 110, in run_exp
_training_function(config={"args": args, "callbacks": callbacks})
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/llamafactory/train/tuner.py", line 72, in _training_function
run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/llamafactory/train/sft/workflow.py", line 96, in run_sft
train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/transformers/trainer.py", line 2206, in train
return inner_training_loop(
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/transformers/trainer.py", line 2502, in _inner_training_loop
batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches, args.device)
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/transformers/trainer.py", line 5300, in get_batch_samples
batch_samples.append(next(epoch_iterator))
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/accelerate/data_loader.py", line 566, in __iter__
current_batch = next(dataloader_iter)
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 733, in __next__
data = self._next_data()
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1515, in _next_data
return self._process_data(data, worker_id)
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1550, in _process_data
data.reraise()
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/torch/_utils.py", line 750, in reraise
raise exception
RuntimeError: Caught RuntimeError in pin memory thread for device 0.
Original Traceback (most recent call last):
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 43, in do_one_step
data = pin_memory(data, device)
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 77, in pin_memory
{k: pin_memory(sample, device) for k, sample in data.items()}
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 77, in <dictcomp>
{k: pin_memory(sample, device) for k, sample in data.items()}
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 66, in pin_memory
return data.pin_memory(device)
RuntimeError: unsupported operation: more than one element of the written-to tensor refers to a single memory location. Please clone() the tensor before performing the operation.
Exception in thread Thread-9 (_pin_memory_loop):
Traceback (most recent call last):
File "/root/.local/share/uv/python/cpython-3.10.18-linux-x86_64-gnu/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
self.run()
File "/root/.local/share/uv/python/cpython-3.10.18-linux-x86_64-gnu/lib/python3.10/threading.py", line 953, in run
self._target(*self._args, **self._kwargs)
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 61, in _pin_memory_loop
do_one_step()
File "/root/autodl-tmp/WeClone/.venv/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 37, in do_one_step
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/root/.local/share/uv/python/cpython-3.10.18-linux-x86_64-gnu/lib/python3.10/multiprocessing/queues.py", line 122, in get
0%| | 0/210 [00:00<?, ?it/s]
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
bug (Something isn't working), good first issue (Good for newcomers)