From 769772630499e859ec91841f744cf044d92abaf4 Mon Sep 17 00:00:00 2001
From: Xingfu-Yi
Date: Sun, 15 Sep 2024 11:31:20 +0800
Subject: [PATCH] Remove the fixed `eot_token` mechanism for SFT

Not all pretrained LLMs use `<|endoftext|>` as the `eot_token`, so it is
inappropriate to hard-code it.
---
 .../training/step1_supervised_finetuning/main.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py
index d9527af54..aa505a25d 100755
--- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py
+++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py
@@ -191,7 +191,13 @@ def parse_args():
     parser.add_argument(
         "--add_eot_token",
         action='store_true',
-        help="Add <|endoftext|> as additional special token to tokenizer")
+        help="Add `eot_token` as additional special token to tokenizer")
+    parser.add_argument(
+        "--eot_token",
+        type=str,
+        default="<|endoftext|>",
+        help="Specify the format of the `eot_token`",
+    )
     ## Print loss
     parser.add_argument('--print_loss',
                         action='store_true',
@@ -234,8 +240,7 @@ def main():
     torch.distributed.barrier()
 
     # load_hf_tokenizer will get the correct tokenizer and set padding tokens based on the model family
-    args.end_of_conversation_token = "<|endoftext|>"
-    additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None
+    additional_special_tokens = args.eot_token if args.add_eot_token else None
     tokenizer = load_hf_tokenizer(args.model_name_or_path,
                                   fast_tokenizer=True,
                                   add_special_tokens=additional_special_tokens)
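
A minimal usage sketch (illustrative, not part of the patch): the model name
below is hypothetical for this example, and other required flags such as
`--data_path` are omitted. A model whose end-of-sequence marker is `</s>`
rather than `<|endoftext|>` could be fine-tuned with:

    # hypothetical invocation; only --add_eot_token and --eot_token
    # come from this patch
    deepspeed main.py \
        --model_name_or_path meta-llama/Llama-2-7b-hf \
        --add_eot_token \
        --eot_token "</s>"

When `--add_eot_token` is unset, `additional_special_tokens` remains `None`
and the tokenizer is left untouched, preserving the previous default
behavior; the only behavioral change is that the added special token is now
configurable instead of being pinned to `<|endoftext|>`.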