From c028a049b8eef61f93d6ffdbbb98361a7879632e Mon Sep 17 00:00:00 2001
From: surprisedPikachu007
Date: Mon, 26 May 2025 18:12:21 +0530
Subject: [PATCH] Add option to disable EOS appending in preprocessing (#5615)

---
 fairseq/options.py        | 3 +++
 fairseq_cli/preprocess.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/fairseq/options.py b/fairseq/options.py
index 920591635a..8c60c6210e 100644
--- a/fairseq/options.py
+++ b/fairseq/options.py
@@ -305,6 +305,9 @@ def add_preprocess_args(parser):
                        help="number of parallel workers")
     group.add_argument("--dict-only", action='store_true',
                        help="if true, only builds a dictionary and then exits")
+    group.add_argument("--no-eos-append", action="store_false",
+                       dest="append_eos", default=True,
+                       help="Do not append EOS to the end of each document")
     # fmt: on
     return parser

diff --git a/fairseq_cli/preprocess.py b/fairseq_cli/preprocess.py
index 2ba9e09338..deb510177f 100644
--- a/fairseq_cli/preprocess.py
+++ b/fairseq_cli/preprocess.py
@@ -110,7 +110,7 @@ def _make_binary_dataset(

     binarizer = VocabularyDatasetBinarizer(
         vocab,
-        append_eos=True,
+        append_eos=args.append_eos,
     )

     input_file = "{}{}".format(input_prefix, ("." + lang) if lang is not None else "")