diff --git a/samples/python/chat_sample/chat_sample.py b/samples/python/chat_sample/chat_sample.py
index 8a6cc25ac2..10e5a3b869 100755
--- a/samples/python/chat_sample/chat_sample.py
+++ b/samples/python/chat_sample/chat_sample.py
@@ -8,7 +8,6 @@
 def streamer(subword):
     print(subword, end='', flush=True)
-    return openvino_genai.StreamerRunningStatus.RUNNING
 
 
 def main():
     parser = argparse.ArgumentParser()
diff --git a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
index 9325a77d00..54af23e434 100755
--- a/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
+++ b/samples/python/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.py
@@ -8,7 +8,8 @@
 def streamer(subword):
     print(subword, end='', flush=True)
     # Return flag corresponds whether generation should be stopped.
-    return openvino_genai.StreamerRunningStatus.RUNNING
+    # False means continue generation.
+    return False
 
 def main():
     parser = argparse.ArgumentParser()
diff --git a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py
index 7f82f9e1b7..740d9b589d 100755
--- a/samples/python/speculative_decoding_lm/speculative_decoding_lm.py
+++ b/samples/python/speculative_decoding_lm/speculative_decoding_lm.py
@@ -10,7 +10,8 @@
 def streamer(subword):
     print(subword, end='', flush=True)
     # Return flag corresponds whether generation should be stopped.
-    return openvino_genai.StreamerRunningStatus.RUNNING
+    # False means continue generation.
+    return False
 
 def main():
     parser = argparse.ArgumentParser()
diff --git a/samples/python/visual_language_chat/visual_language_chat.py b/samples/python/visual_language_chat/visual_language_chat.py
index aa621d18df..c85c41d826 100755
--- a/samples/python/visual_language_chat/visual_language_chat.py
+++ b/samples/python/visual_language_chat/visual_language_chat.py
@@ -11,21 +11,18 @@
 from pathlib import Path
 
 
-def streamer(subword: str) -> openvino_genai.StreamerRunningStatus:
+def streamer(subword: str):
     '''
 
     Args:
         subword: sub-word of the generated text.
 
     Returns:
         Return flag corresponds whether generation should be stopped.
+        "return None" will be treated the same as "return False".
 
     '''
     print(subword, end='', flush=True)
-    # No value is returned as in this example we don't want to stop the generation in this method.
-    # "return None" will be treated the same as "return False".
-
-    return openvino_genai.StreamerRunningStatus.RUNNING
 
 
 def read_image(path: str) -> Tensor:
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 82bf0a544d..be2a677625 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -187,6 +187,9 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, prompt_ids, generation_config, block_size);
         requests.push_back(sequence_group);
 
+        OPENVINO_ASSERT((!m_is_chat_conversation || !std::get_if<std::function<ov::genai::StreamerRunningStatus(std::string)>>(&streamer)),
+                        "For chat mode, please use a streamer derived from StreamerBase or a callback with a bool return value.");
+
         std::shared_ptr<StreamerBase> streamer_ptr = std::visit(overloaded{
             [&m_tokenizer = m_tokenizer](
                 const std::function<bool(std::string)>& callback
diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py
index 1a1e5cfc12..5f33ebc8e4 100644
--- a/src/python/openvino_genai/__init__.py
+++ b/src/python/openvino_genai/__init__.py
@@ -15,8 +15,7 @@
     RawPerfMetrics,
     PerfMetrics,
     StreamerBase,
-    get_version,
-    StreamerRunningStatus
+    get_version
 )
 
 __version__ = get_version()
diff --git a/src/python/openvino_genai/__init__.pyi b/src/python/openvino_genai/__init__.pyi
index e54d898b2e..0a401ae958 100644
--- a/src/python/openvino_genai/__init__.pyi
+++ b/src/python/openvino_genai/__init__.pyi
@@ -30,7 +30,6 @@ from openvino_genai.py_openvino_genai import Scheduler
 from openvino_genai.py_openvino_genai import SchedulerConfig
 from openvino_genai.py_openvino_genai import StopCriteria
 from openvino_genai.py_openvino_genai import StreamerBase
-from openvino_genai.py_openvino_genai import StreamerRunningStatus
 from openvino_genai.py_openvino_genai import T5EncoderModel
 from openvino_genai.py_openvino_genai import Text2ImagePipeline
 from openvino_genai.py_openvino_genai import TokenizedInputs
@@ -46,5 +45,5 @@ from openvino_genai.py_openvino_genai import draft_model
 from openvino_genai.py_openvino_genai import get_version
 import os as os
 from . import py_openvino_genai
-__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationResult', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'PerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'StreamerRunningStatus', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'get_version', 'openvino', 'os', 'py_openvino_genai']
+__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationResult', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'PerfMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMPipeline', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'get_version', 'openvino', 'os', 'py_openvino_genai']
 __version__: str
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index f46431d6e8..5adbd0859f 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -5,7 +5,7 @@ from __future__ import annotations
 import openvino._pyopenvino
 import os
 import typing
-__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'StreamerRunningStatus', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMDecodedResults', 'VLMPerfMetrics', 'VLMPipeline', 'VLMRawPerfMetrics', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'get_version']
+__all__ = ['Adapter', 'AdapterConfig', 'AggregationMode', 'AutoencoderKL', 'CLIPTextModel', 'CLIPTextModelWithProjection', 'CacheEvictionConfig', 'ChunkStreamerBase', 'ContinuousBatchingPipeline', 'CppStdGenerator', 'DecodedResults', 'EncodedGenerationResult', 'EncodedResults', 'FluxTransformer2DModel', 'GenerationConfig', 'GenerationFinishReason', 'GenerationHandle', 'GenerationOutput', 'GenerationResult', 'GenerationStatus', 'Generator', 'Image2ImagePipeline', 'ImageGenerationConfig', 'InpaintingPipeline', 'LLMPipeline', 'MeanStdPair', 'PerfMetrics', 'PipelineMetrics', 'RawPerfMetrics', 'SD3Transformer2DModel', 'Scheduler', 'SchedulerConfig', 'StopCriteria', 'StreamerBase', 'T5EncoderModel', 'Text2ImagePipeline', 'TokenizedInputs', 'Tokenizer', 'TorchGenerator', 'UNet2DConditionModel', 'VLMDecodedResults', 'VLMPerfMetrics', 'VLMPipeline', 'VLMRawPerfMetrics', 'WhisperDecodedResultChunk', 'WhisperDecodedResults', 'WhisperGenerationConfig', 'WhisperPerfMetrics', 'WhisperPipeline', 'WhisperRawPerfMetrics', 'draft_model', 'get_version']
 class Adapter:
     """
     Immutable LoRA Adapter that carries the adaptation matrices and serves as unique adapter identifier.
@@ -373,10 +373,10 @@ class ContinuousBatchingPipeline:
     def add_request(self, request_id: int, prompt: str, generation_config: GenerationConfig) -> GenerationHandle:
         ...
     @typing.overload
-    def generate(self, input_ids: list[openvino._pyopenvino.Tensor], generation_config: list[GenerationConfig], streamer: typing.Callable[[str], bool] | typing.Callable[[str], StreamerRunningStatus] | StreamerBase | None = None) -> list[EncodedGenerationResult]:
+    def generate(self, input_ids: list[openvino._pyopenvino.Tensor], generation_config: list[GenerationConfig], streamer: typing.Callable[[str], bool] | StreamerBase | None = None) -> list[EncodedGenerationResult]:
         ...
     @typing.overload
-    def generate(self, prompts: list[str], generation_config: list[GenerationConfig], streamer: typing.Callable[[str], bool] | typing.Callable[[str], StreamerRunningStatus] | StreamerBase | None = None) -> list[GenerationResult]:
+    def generate(self, prompts: list[str], generation_config: list[GenerationConfig], streamer: typing.Callable[[str], bool] | StreamerBase | None = None) -> list[GenerationResult]:
         ...
     def get_config(self) -> GenerationConfig:
         ...
@@ -940,7 +940,7 @@ class LLMPipeline:
     """
     This class is used for generation with LLMs
     """
-    def __call__(self, inputs: openvino._pyopenvino.Tensor | TokenizedInputs | str | list[str], generation_config: GenerationConfig | None = None, streamer: typing.Callable[[str], StreamerRunningStatus] | StreamerBase | None = None, **kwargs) -> EncodedResults | DecodedResults:
+    def __call__(self, inputs: openvino._pyopenvino.Tensor | TokenizedInputs | str | list[str], generation_config: GenerationConfig | None = None, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> EncodedResults | DecodedResults:
         """
         Generates sequences or tokens for LLMs. If input is a string or list of strings then resulting sequences will be already detokenized.
 
@@ -951,7 +951,7 @@
         :type generation_config: GenerationConfig or a Dict
 
         :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped
-        :type : Callable[[str], bool], Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase
+        :type : Callable[[str], bool], ov.genai.StreamerBase
 
         :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields.
         :type : Dict
@@ -1025,7 +1025,7 @@
         """
     def finish_chat(self) -> None:
         ...
-    def generate(self, inputs: openvino._pyopenvino.Tensor | TokenizedInputs | str | list[str], generation_config: GenerationConfig | None = None, streamer: typing.Callable[[str], StreamerRunningStatus] | StreamerBase | None = None, **kwargs) -> EncodedResults | DecodedResults:
+    def generate(self, inputs: openvino._pyopenvino.Tensor | TokenizedInputs | str | list[str], generation_config: GenerationConfig | None = None, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> EncodedResults | DecodedResults:
         """
         Generates sequences or tokens for LLMs. If input is a string or list of strings then resulting sequences will be already detokenized.
 
@@ -1036,7 +1036,7 @@
         :type generation_config: GenerationConfig or a Dict
 
         :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped
-        :type : Callable[[str], bool], Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase
+        :type : Callable[[str], bool], ov.genai.StreamerBase
 
         :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields.
         :type : Dict
@@ -1497,49 +1497,6 @@ class StreamerBase:
         """
         Put is called every time new token is decoded. Returns a bool flag to indicate whether generation should be stopped, if return true generation stops
         """
-class StreamerRunningStatus:
-    """
-    Members:
-
-      UNDEF
-
-      RUNNING
-
-      STOP
-
-      CANCEL
-    """
-    CANCEL: typing.ClassVar[StreamerRunningStatus]  # value = <StreamerRunningStatus.CANCEL: 3>
-    RUNNING: typing.ClassVar[StreamerRunningStatus]  # value = <StreamerRunningStatus.RUNNING: 1>
-    STOP: typing.ClassVar[StreamerRunningStatus]  # value = <StreamerRunningStatus.STOP: 2>
-    UNDEF: typing.ClassVar[StreamerRunningStatus]  # value = <StreamerRunningStatus.UNDEF: 0>
-    __members__: typing.ClassVar[dict[str, StreamerRunningStatus]]  # value = {'UNDEF': <StreamerRunningStatus.UNDEF: 0>, 'RUNNING': <StreamerRunningStatus.RUNNING: 1>, 'STOP': <StreamerRunningStatus.STOP: 2>, 'CANCEL': <StreamerRunningStatus.CANCEL: 3>}
-    def __eq__(self, other: typing.Any) -> bool:
-        ...
-    def __getstate__(self) -> int:
-        ...
-    def __hash__(self) -> int:
-        ...
-    def __index__(self) -> int:
-        ...
-    def __init__(self, value: int) -> None:
-        ...
-    def __int__(self) -> int:
-        ...
-    def __ne__(self, other: typing.Any) -> bool:
-        ...
-    def __repr__(self) -> str:
-        ...
-    def __setstate__(self, state: int) -> None:
-        ...
-    def __str__(self) -> str:
-        ...
-    @property
-    def name(self) -> str:
-        ...
-    @property
-    def value(self) -> int:
-        ...
 class T5EncoderModel:
     """
     T5EncoderModel class.
@@ -1845,7 +1802,7 @@ class VLMPipeline:
     def finish_chat(self) -> None:
         ...
     @typing.overload
-    def generate(self, prompt: str, images: list[openvino._pyopenvino.Tensor], generation_config: GenerationConfig, streamer: typing.Callable[[str], StreamerRunningStatus] | StreamerBase | None = None, **kwargs) -> VLMDecodedResults:
+    def generate(self, prompt: str, images: list[openvino._pyopenvino.Tensor], generation_config: GenerationConfig, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> VLMDecodedResults:
         """
         Generates sequences for VLMs.
 
@@ -1858,8 +1815,8 @@ class VLMPipeline:
         :param generation_config: generation_config
         :type generation_config: GenerationConfig or a Dict
 
-        :param streamer: streamer either as a lambda with a StreamerRunningStatus returning flag whether generation should be stopped. Please, be aware that status CANCELLED is not supported and work as STOP.
-        :type : Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase
+        :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped.
+        :type : Callable[[str], bool], ov.genai.StreamerBase
 
         :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields.
         :type : Dict
@@ -1868,7 +1825,7 @@ class VLMPipeline:
         :rtype: VLMDecodedResults
         """
     @typing.overload
-    def generate(self, prompt: str, images: openvino._pyopenvino.Tensor, generation_config: GenerationConfig, streamer: typing.Callable[[str], StreamerRunningStatus] | StreamerBase | None = None, **kwargs) -> VLMDecodedResults:
+    def generate(self, prompt: str, images: openvino._pyopenvino.Tensor, generation_config: GenerationConfig, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> VLMDecodedResults:
         """
         Generates sequences for VLMs.
 
@@ -1881,8 +1838,8 @@ class VLMPipeline:
         :param generation_config: generation_config
         :type generation_config: GenerationConfig or a Dict
 
-        :param streamer: streamer either as a lambda with a StreamerRunningStatus returning flag whether generation should be stopped. Please, be aware that status CANCELLED is not supported and work as STOP.
-        :type : Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase
+        :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped.
+        :type : Callable[[str], bool], ov.genai.StreamerBase
 
         :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields.
         :type : Dict
@@ -1904,8 +1861,7 @@ class VLMPipeline:
     image: ov.Tensor - input image,
     images: List[ov.Tensor] - input images,
     generation_config: GenerationConfig,
-    streamer: Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase - streamer either as a lambda with a StreamerRunningStatus returning flag whether generation should be stopped.
-    Please, be aware that status CANCELLED is not supported and work as STOP.
+    streamer: Callable[[str], bool], ov.genai.StreamerBase - streamer either as a lambda with a boolean returning flag whether generation should be stopped.
 
     :return: return results in decoded form
     :rtype: VLMDecodedResults
diff --git a/src/python/py_llm_pipeline.cpp b/src/python/py_llm_pipeline.cpp
index 301883abf0..fb64eabd74 100644
--- a/src/python/py_llm_pipeline.cpp
+++ b/src/python/py_llm_pipeline.cpp
@@ -37,7 +37,7 @@ auto generate_docstring = R"(
     :type generation_config: GenerationConfig or a Dict
 
     :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped
-    :type : Callable[[str], bool], Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase
+    :type : Callable[[str], bool], ov.genai.StreamerBase
 
     :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields.
     :type : Dict
diff --git a/src/python/py_openvino_genai.cpp b/src/python/py_openvino_genai.cpp
index 29bc5caae8..f8e577d5c8 100644
--- a/src/python/py_openvino_genai.cpp
+++ b/src/python/py_openvino_genai.cpp
@@ -118,12 +118,6 @@ PYBIND11_MODULE(py_openvino_genai, m) {
         .def("put", &StreamerBase::put, "Put is called every time new token is decoded. Returns a bool flag to indicate whether generation should be stopped, if return true generation stops", py::arg("token"))
         .def("end", &StreamerBase::end, "End is called at the end of generation. It can be used to flush cache if your own streamer has one");
 
-    py::enum_<ov::genai::StreamerRunningStatus>(m, "StreamerRunningStatus")
-        .value("UNDEF", ov::genai::StreamerRunningStatus::UNDEF)
-        .value("RUNNING", ov::genai::StreamerRunningStatus::RUNNING)
-        .value("STOP", ov::genai::StreamerRunningStatus::STOP)
-        .value("CANCEL", ov::genai::StreamerRunningStatus::CANCEL);
-
     init_tokenizer(m);
     init_lora_adapter(m);
     init_generation_config(m);
diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp
index 52ce9df3bb..1fc34a36d2 100644
--- a/src/python/py_utils.cpp
+++ b/src/python/py_utils.cpp
@@ -336,10 +336,10 @@ ov::genai::StreamerVariant pystreamer_to_streamer(const PyBindStreamerVariant& p
     ov::genai::StreamerVariant streamer = std::monostate();
 
     std::visit(overloaded {
-        [&streamer](const std::function<ov::genai::StreamerRunningStatus(py::str)>& py_callback){
+        [&streamer](const std::function<bool(py::str)>& py_callback){
             // Wrap python streamer with manual utf-8 decoding. Do not rely
             // on pybind automatic decoding since it raises exceptions on incomplete strings.
-            auto callback_wrapped = [py_callback](std::string subword) -> ov::genai::StreamerRunningStatus {
+            auto callback_wrapped = [py_callback](std::string subword) -> bool {
                 auto py_str = PyUnicode_DecodeUTF8(subword.data(), subword.length(), "replace");
                 return py_callback(py::reinterpret_borrow<py::str>(py_str));
             };
diff --git a/src/python/py_utils.hpp b/src/python/py_utils.hpp
index f085a5c922..9d78ab0930 100644
--- a/src/python/py_utils.hpp
+++ b/src/python/py_utils.hpp
@@ -12,13 +12,12 @@
 namespace py = pybind11;
 
 using ov::genai::StreamerBase;
-using ov::genai::StreamerRunningStatus;
 
 namespace ov::genai::pybind::utils {
 
 // When StreamerVariant is used utf-8 decoding is done by pybind and can lead to exception on incomplete texts.
 // Therefore strings decoding should be handled with PyUnicode_DecodeUTF8(..., "replace") to not throw errors.
-using PyBindStreamerVariant = std::variant<std::function<ov::genai::StreamerRunningStatus(py::str)>, std::shared_ptr<StreamerBase>, std::monostate>;
+using PyBindStreamerVariant = std::variant<std::function<bool(py::str)>, std::shared_ptr<StreamerBase>, std::monostate>;
 
 template <typename... Ts>
 struct overloaded : Ts... {
diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp
index 9ebf3f2df5..e8860065c4 100644
--- a/src/python/py_vlm_pipeline.cpp
+++ b/src/python/py_vlm_pipeline.cpp
@@ -31,8 +31,8 @@ auto vlm_generate_docstring = R"(
     :param generation_config: generation_config
     :type generation_config: GenerationConfig or a Dict
 
-    :param streamer: streamer either as a lambda with a StreamerRunningStatus returning flag whether generation should be stopped. Please, be aware that status CANCELLED is not supported and work as STOP.
-    :type : Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase
+    :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped.
+    :type : Callable[[str], bool], ov.genai.StreamerBase
 
     :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields.
     :type : Dict
@@ -53,8 +53,7 @@ auto vlm_generate_kwargs_docstring = R"(
     image: ov.Tensor - input image,
     images: List[ov.Tensor] - input images,
     generation_config: GenerationConfig,
-    streamer: Callable[[str], ov.genai.StreamerRunningStatus], ov.genai.StreamerBase - streamer either as a lambda with a StreamerRunningStatus returning flag whether generation should be stopped.
-    Please, be aware that status CANCELLED is not supported and work as STOP.
+    streamer: Callable[[str], bool], ov.genai.StreamerBase - streamer either as a lambda with a boolean returning flag whether generation should be stopped.
 
     :return: return results in decoded form
     :rtype: VLMDecodedResults
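
For reference, a minimal sketch of the streamer contract after this patch (not part of the patch itself; './model_dir' and the prompt are placeholder values, and any model exported for openvino_genai.LLMPipeline works the same way). The callback now returns a plain bool, where True stops generation and False or None continues it:

    import openvino_genai


    def streamer(subword: str) -> bool:
        # Print each generated subword as soon as it is decoded.
        print(subword, end='', flush=True)
        # False (or None) lets generation continue; True stops it.
        return False


    # './model_dir' is a hypothetical path to a model converted to OpenVINO IR.
    pipe = openvino_genai.LLMPipeline('./model_dir', 'CPU')
    config = openvino_genai.GenerationConfig()
    config.max_new_tokens = 100
    pipe.generate('Why is the sky blue?', config, streamer)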