From 769772630499e859ec91841f744cf044d92abaf4 Mon Sep 17 00:00:00 2001
From: Xingfu-Yi
Date: Sun, 15 Sep 2024 11:31:20 +0800
Subject: [PATCH] Remove the fixed `eot_token` mechanism for SFT

Not all pretrained LLMs use `<|endoftext|>` as the `eot_token`, so it is
inappropriate to hard-code it.
---
 .../training/step1_supervised_finetuning/main.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py
index d9527af54..aa505a25d 100755
--- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py
+++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py
@@ -191,7 +191,13 @@ def parse_args():
     parser.add_argument(
         "--add_eot_token",
         action='store_true',
-        help="Add <|endoftext|> as additional special token to tokenizer")
+        help="Add `eot_token` as additional special token to tokenizer")
+    parser.add_argument(
+        "--eot_token",
+        type=str,
+        default="<|endoftext|>",
+        help="Specify the format of the `eot_token`",
+    )
     ## Print loss
     parser.add_argument('--print_loss',
                         action='store_true',
@@ -234,8 +240,7 @@ def main():
     torch.distributed.barrier()
 
     # load_hf_tokenizer will get the correct tokenizer and set padding tokens based on the model family
-    args.end_of_conversation_token = "<|endoftext|>"
-    additional_special_tokens = args.end_of_conversation_token if args.add_eot_token else None
+    additional_special_tokens = args.eot_token if args.add_eot_token else None
     tokenizer = load_hf_tokenizer(args.model_name_or_path,
                                   fast_tokenizer=True,
                                   add_special_tokens=additional_special_tokens)
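
A minimal usage sketch (illustrative, not part of the patch): the model name
below is hypothetical for this example, and other required flags such as
`--data_path` are omitted. A model whose end-of-sequence marker is `</s>`
rather than `<|endoftext|>` could be fine-tuned with:

    # hypothetical invocation; only --add_eot_token and --eot_token
    # come from this patch
    deepspeed main.py \
        --model_name_or_path meta-llama/Llama-2-7b-hf \
        --add_eot_token \
        --eot_token "</s>"

When `--add_eot_token` is unset, `additional_special_tokens` remains `None`
and the tokenizer is left untouched, preserving the previous default
behavior; the only behavioral change is that the added special token is now
configurable instead of being pinned to `<|endoftext|>`.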