add max_length parametrisation to encode #1518

Draft. Wants to merge 1 commit into base: master.
9 changes: 7 additions & 2 deletions src/cpp/include/openvino/genai/tokenizer.hpp
@@ -125,7 +125,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
/**
* @brief encode a single prompt
* @param prompt std::string with input prompt
* @param properties tokenization properties, e.g. ov::genai::add_special_tokens(false)
* @param add_special_tokens whether to add special tokens
* @param max_length maximum length to which output will be padded or truncated
* @param padding_mode whether to pad result, allowed values are ["truncate", "longest", "max_length", "do_not_pad"]
* @return pair of [input_ids, attention_mask]
*/
template <typename... Properties>
@@ -136,7 +138,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
/**
* @brief encode batch of prompts. Left padding will be applied by default
* @param prompts vector storing batch of prompts
* @param properties tokenization properties, e.g. ov::genai::add_special_tokens(false)
* @param add_special_tokens whether to add special tokens
* @param max_length maximum length to which output will be padded or truncated
* @param padding_mode whether to pad result, allowed values are ["truncate", "pad"]
* @return pair of [input_ids, attention_mask]
*/
template <typename... Properties>
@@ -240,6 +244,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {

static constexpr ov::Property<bool> add_special_tokens{"add_special_tokens"};
static constexpr ov::Property<bool> skip_special_tokens{"skip_special_tokens"};
static constexpr ov::Property<std::string> padding_mode{"padding_mode"};

} // namespace genai
} // namespace ov
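
For a concrete picture of the new parameters, here is a minimal usage sketch through the Python binding added later in this PR. The keyword names and defaults follow the diff below; the model directory path is a hypothetical placeholder.

import openvino_genai as ov_genai

# Hypothetical directory containing openvino_tokenizer.xml / .bin.
tokenizer = ov_genai.Tokenizer("TinyLlama-1.1B-Chat-v1.0/")

# Single prompt: truncate the result to at most 16 tokens.
single = tokenizer.encode("1+1=", add_special_tokens=True,
                          max_length=16, padding_mode="truncate")
print(single.input_ids.data.shape)

# Batch of prompts: pad every prompt up to max_length tokens.
batch = tokenizer.encode(["1+1=", "What is the previous answer?"],
                         max_length=16, padding_mode="max_length")
print(batch.input_ids.data.shape, batch.attention_mask.data.shape)
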
59 changes: 51 additions & 8 deletions src/cpp/src/tokenizer.cpp
@@ -97,6 +97,8 @@ class Tokenizer::TokenizerImpl {
// this flag holds the current state value of the CompiledModel.
bool m_add_special_tokens = true;
bool m_skip_special_tokens = true;
int m_max_pad_length = std::numeric_limits<int>::max();
int m_max_trunc_length = std::numeric_limits<int>::max();
bool m_older_than_24_5 = false;

int64_t m_pad_token_id = -1;
@@ -109,17 +111,47 @@

std::string m_chat_template = {};

std::pair<int, int> get_padding_values(std::string padding_mode, size_t max_length) {
if (padding_mode == "truncate") {
return {max_length, std::numeric_limits<int32_t>::max()};
} else if (padding_mode == "longest") {
return {std::numeric_limits<int32_t>::max(), std::numeric_limits<int32_t>::max()};
} else if (padding_mode == "max_length") {
return {std::numeric_limits<int32_t>::max(), max_length};
} else if (padding_mode == "do_not_pad") {

[Review comment (Contributor)] Why a string rather than an enum?

// bahves exactly as longest

[Review comment (Contributor)] Suggested change: "// bahves exactly as longest" -> "// behaves exactly as longest"

// TODO: need to find a way to disable padding automatically so that it matches HF behavior.
return {std::numeric_limits<int32_t>::max(), std::numeric_limits<int32_t>::max()};
} else {
OPENVINO_THROW("Unknown padding mode: " + padding_mode);
}
}

void set_state_if_necessary(CircularBufferQueueElementGuard<ov::InferRequest>& infer_request_guard, const ov::AnyMap& params) {
bool add_special_tokens_flag = m_add_special_tokens;
bool skip_special_tokens_flag = m_skip_special_tokens;

size_t max_length_val;
std::string padding_mode_val;

ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag);
ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag);
ov::genai::utils::read_anymap_param(params, padding_mode.name(), padding_mode_val);
ov::genai::utils::read_anymap_param(params, max_length.name(), max_length_val);

int max_trunc_length_val = m_max_trunc_length;
int max_pad_length_val = m_max_pad_length;

if (!padding_mode_val.empty()) {
std::tie(max_trunc_length_val, max_pad_length_val) = get_padding_values(padding_mode_val, max_length_val);
}

// If user requested add_special_tokens mode different from the current one,
// need to set state variable.
// If requested mode matches the stored state set, then don't touch states.
if (add_special_tokens_flag == m_add_special_tokens && skip_special_tokens_flag == m_skip_special_tokens) {
if (add_special_tokens_flag == m_add_special_tokens
&& skip_special_tokens_flag == m_skip_special_tokens
&& max_trunc_length_val == m_max_trunc_length
&& max_pad_length_val == m_max_pad_length) {
return;
}
if (m_older_than_24_5) {
@@ -137,15 +169,26 @@ class Tokenizer::TokenizerImpl {
ov::Tensor skip_special_tensor = ov::Tensor(ov::element::i32, {1});
*skip_special_tensor.data<int>() = skip_special_tokens_flag;

ov::Tensor max_trunc_length_tensor = ov::Tensor(ov::element::i32, {1});
*max_trunc_length_tensor.data<int>() = max_trunc_length_val;
ov::Tensor max_pad_length_tensor = ov::Tensor(ov::element::i32, {1});
*max_pad_length_tensor.data<int>() = max_pad_length_val;

for (auto& state: infer_request_guard.get().query_state()) {
if (state.get_name().find(ov::genai::ADD_SPECIAL_TOKENS_VAR_ID) != std::string::npos) {
if (state.get_name().find(add_special_tokens.name()) != std::string::npos) {
state.set_state(add_special_tensor);
} else if (state.get_name().find(ov::genai::SKIP_SPECIAL_TOKENS_VAR_ID) != std::string::npos) {
} else if (state.get_name().find(skip_special_tokens.name()) != std::string::npos) {
state.set_state(skip_special_tensor);
} else if (state.get_name().find("max_trunc_length") != std::string::npos) {
state.set_state(max_trunc_length_tensor);
} else if (state.get_name().find("max_pad_length") != std::string::npos) {
state.set_state(max_pad_length_tensor);
}
}
m_add_special_tokens = add_special_tokens_flag;
m_skip_special_tokens = skip_special_tokens_flag;
m_max_trunc_length = max_trunc_length_val;
m_max_pad_length = max_pad_length_val;
}

TokenizerImpl(const std::filesystem::path& models_path, const ov::AnyMap& properties) {
@@ -625,22 +668,22 @@ Tokenizer::Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, c
}

TokenizedInputs Tokenizer::encode(const std::string prompt, const ov::AnyMap& tokenization_params) {
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()});
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::padding_mode.name()});
return m_pimpl->encode(std::move(prompt), tokenization_params);
}

TokenizedInputs Tokenizer::encode(std::vector<std::string>& prompts, const ov::AnyMap& tokenization_params) {
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()});
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::padding_mode.name()});
return m_pimpl->encode(prompts, tokenization_params);
}

TokenizedInputs Tokenizer::encode(std::vector<std::string>&& prompts, const ov::AnyMap& tokenization_params) {
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()});
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::padding_mode.name()});
return m_pimpl->encode(prompts, tokenization_params);
}

TokenizedInputs Tokenizer::encode(std::initializer_list<std::string>& text, const ov::AnyMap& tokenization_params) {
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name()});
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::padding_mode.name()});
return encode(std::vector<std::string>(text.begin(), text.end()), tokenization_params);
}

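
For readability, the mode dispatch in get_padding_values above boils down to a mapping from padding_mode to the pair (max_trunc_length, max_pad_length). The Python transcription below is illustrative only and is not part of the PR; INT32_MAX stands in for std::numeric_limits<int32_t>::max().

INT32_MAX = 2**31 - 1  # std::numeric_limits<int32_t>::max()

def get_padding_values(padding_mode: str, max_length: int) -> tuple[int, int]:
    """Return (max_trunc_length, max_pad_length) for a given padding mode."""
    if padding_mode == "truncate":
        return (max_length, INT32_MAX)   # truncate to max_length, no fixed padding target
    if padding_mode == "longest":
        return (INT32_MAX, INT32_MAX)    # no truncation limit, no fixed padding target
    if padding_mode == "max_length":
        return (INT32_MAX, max_length)   # no truncation limit, pad up to max_length
    if padding_mode == "do_not_pad":
        # Currently behaves exactly as "longest"; see the TODO in the C++ code above.
        return (INT32_MAX, INT32_MAX)
    raise ValueError(f"Unknown padding mode: {padding_mode}")
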
4 changes: 2 additions & 2 deletions src/python/openvino_genai/py_openvino_genai.pyi
@@ -1656,12 +1656,12 @@ class Tokenizer:
        Decode a batch of tokens into a list of string prompt.
        """
    @typing.overload
    def encode(self, prompts: list[str], add_special_tokens: bool = True) -> TokenizedInputs:
    def encode(self, prompts: list[str], add_special_tokens: bool = True, max_length: int = 2147483647, padding_mode: str = 'truncate') -> TokenizedInputs:
        """
        Encodes a list of prompts into tokenized inputs.
        """
    @typing.overload
    def encode(self, prompt: str, add_special_tokens: bool = True) -> TokenizedInputs:
    def encode(self, prompt: str, add_special_tokens: bool = True, max_length: int = 2147483647, padding_mode: str = 'truncate') -> TokenizedInputs:
        """
        Encodes a single prompt into tokenized input.
        """
20 changes: 17 additions & 3 deletions src/python/py_tokenizer.cpp
@@ -44,21 +44,35 @@ void init_tokenizer(py::module_& m) {
return std::make_unique<ov::genai::Tokenizer>(tokenizer_path, kwargs_properties);
}), py::arg("tokenizer_path"), py::arg("properties") = ov::AnyMap({}))

.def("encode", [](Tokenizer& tok, std::vector<std::string>& prompts, bool add_special_tokens) {
.def("encode", [](Tokenizer& tok, std::vector<std::string>& prompts,
bool add_special_tokens,
size_t max_length,
std::string padding_mode) {
ov::AnyMap tokenization_params;
tokenization_params[ov::genai::add_special_tokens.name()] = add_special_tokens;
tokenization_params[ov::genai::max_length.name()] = max_length;
tokenization_params[ov::genai::padding_mode.name()] = padding_mode;
return tok.encode(prompts, tokenization_params);
},
py::arg("prompts"),
py::arg("add_special_tokens") = true,
py::arg("max_length") = std::numeric_limits<int>::max(),
py::arg("padding_mode") = "truncate",
R"(Encodes a list of prompts into tokenized inputs.)")

.def("encode", [](Tokenizer& tok, const std::string prompt, bool add_special_tokens) {
.def("encode", [](Tokenizer& tok, const std::string prompt,
bool add_special_tokens,
size_t max_length,
std::string padding_mode) {
ov::AnyMap tokenization_params;
tokenization_params[ov::genai::add_special_tokens.name()] = add_special_tokens;
tokenization_params[ov::genai::max_length.name()] = max_length;
tokenization_params[ov::genai::padding_mode.name()] = padding_mode;
return tok.encode(prompt, tokenization_params);
},
py::arg("prompt"), py::arg("add_special_tokens") = true,
py::arg("prompt"), py::arg("add_special_tokens") = true,
py::arg("max_length") = std::numeric_limits<int>::max(),
py::arg("padding_mode") = "truncate",
R"(Encodes a single prompt into tokenized input.)")

.def(
20 changes: 20 additions & 0 deletions tests/python_tests/test_tokenizer.py
@@ -236,6 +236,26 @@ def test_encode_decode_with_special_tokens_option(add_special_tokens, skip_speci
    decoded_hf = hf_tokenizer.decode(hf_res[0], skip_special_tokens=skip_special_tokens)
    assert decoded_genai == decoded_hf

prompts = [
['1+1=', 'What is the previous answer?']
]
@pytest.mark.precommit
@pytest.mark.nightly
@pytest.mark.parametrize("add_special_tokens", [True, False])
@pytest.mark.parametrize("max_length", [10, 16, 64, 512])
@pytest.mark.parametrize("pad_mode", ["truncate", "longest", "max_length", "do_not_pad"])
@pytest.mark.parametrize("prompt", prompts)
def test_padding(add_special_tokens, max_length, pad_mode, prompt):
    import numpy as np
    model_descr = get_chat_models_list()[0]
    model_id, path, hf_tokenizer, model_opt, ov_pipe = read_model((model_descr[0], model_descr[1] / '_test_chat'))
    genai_tokenizer = ov_pipe.get_tokenizer()

    # Calling encode with 'add_special_tokens' will set the state flag.
    ov_res = genai_tokenizer.encode(prompt, add_special_tokens=add_special_tokens, max_length=max_length, padding_mode=pad_mode).input_ids.data
    hf_res = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=add_special_tokens, max_length=max_length, padding=pad_mode)["input_ids"]
    assert np.all(ov_res == hf_res)

[Review comment (Contributor)] Do we need to test the attention mask as well? Is the attention_mask's tail filled with zeros in case of padding?
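
Regarding the question above, a hedged sketch of what an attention_mask check could look like, mirroring test_padding; it is not part of this PR, and whether the HF and GenAI masks match for every padding_mode is exactly what the reviewer is asking.

import numpy as np

def check_attention_mask(genai_tokenizer, hf_tokenizer, prompt,
                         add_special_tokens, max_length, pad_mode):
    # The padded tail of attention_mask is expected to be zeros on both sides.
    ov_mask = genai_tokenizer.encode(prompt, add_special_tokens=add_special_tokens,
                                     max_length=max_length,
                                     padding_mode=pad_mode).attention_mask.data
    hf_mask = hf_tokenizer(prompt, return_tensors="np",
                           add_special_tokens=add_special_tokens,
                           max_length=max_length, padding=pad_mode)["attention_mask"]
    assert np.all(ov_mask == hf_mask)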



@pytest.mark.precommit
@pytest.mark.nightly