diff --git a/llm_bench/python/benchmark.py b/llm_bench/python/benchmark.py index 9b92240c16..6cc1e34e9e 100644 --- a/llm_bench/python/benchmark.py +++ b/llm_bench/python/benchmark.py @@ -188,12 +188,12 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data for bs_index, in_text in enumerate(input_text_list): utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id) tok_encode_start = time.perf_counter() - input_data = tokenizer(input_text_list, return_tensors='pt') + input_data = tokenizer.encode(input_text_list) tok_encode_end = time.perf_counter() tok_encode_time = (tok_encode_end - tok_encode_start) * 1000 # Remove `token_type_ids` from inputs - input_tokens = input_data['input_ids'] if 'input_ids' in input_data else input_data - input_token_size = input_tokens[0].numel() + input_tokens = input_data.input_ids.data + input_token_size = input_tokens[0].size if args['batch_size'] > 1: out_str = '[warm-up]' if num == 0 else '[{}]'.format(num) out_str += " Batch_size={}, ".format(args['batch_size']) @@ -209,25 +209,24 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count'] streamer.reset() start = time.perf_counter() - generated_text = model.generate(input_text_list, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], streamer=streamer).texts + generated_tokens = model.generate(input_data, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], streamer=streamer).tokens end = time.perf_counter() + log.info(type(generated_tokens[0])) if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2: mem_consumption.end_collect_momory_consumption() max_rss_mem_consumption, max_shared_mem_consumption = mem_consumption.get_max_memory_consumption() mem_consumption.clear_max_memory_consumption() generation_time = end - start - - result = [streamer.get_tokens()] tok_decode_start = time.perf_counter() - _ = tokenizer.batch_decode(np.array(result, dtype=int)) + generated_text = tokenizer.decode(generated_tokens) tok_decode_end = time.perf_counter() tok_decode_time = (tok_decode_end - tok_decode_start) * 1000 # Only text_gen need to minus length of input_data, because generated_text may include input_text num_tokens = 0 result_md5_list = [] for bs_idx in range(args['batch_size']): - generated_text_len = len(result[bs_idx]) + generated_text_len = len(generated_tokens[bs_idx]) num_tokens += generated_text_len if generated_text_len > max_gen_tokens: log.error('Output token size is over max output token size!') diff --git a/llm_bench/python/utils/ov_utils.py b/llm_bench/python/utils/ov_utils.py index caeb6bbb14..2c192ce65e 100644 --- a/llm_bench/python/utils/ov_utils.py +++ b/llm_bench/python/utils/ov_utils.py @@ -181,11 +181,10 @@ def create_text_gen_model(model_path, device, **kwargs): def create_genai_text_gen_model(model_path, device, ov_config, **kwargs): import openvino_tokenizers # noqa: F401 import openvino_genai - from transformers import AutoTokenizer class TokenStreamer(openvino_genai.StreamerBase): def __init__(self, tokenizer): - super().__init__() + openvino_genai.StreamerBase.__init__(self) self.tokenizer = tokenizer self.token_generation_time = [] self.generated_tokens = [] @@ -214,21 +213,15 @@ def get_time_list(self): if not (model_path / "openvino_tokenizer.xml").exists() or not (model_path / "openvino_detokenizer.xml").exists(): convert_ov_tokenizer(model_path) - core = Core() - hf_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - ov_tok = core.read_model(model_path / "openvino_tokenizer.xml") - ov_detok = core.read_model(model_path / "openvino_detokenizer.xml") - hf_tokenizer = build_ov_tokenizer_wrapper(hf_tokenizer, ov_tok, ov_detok) - start = time.perf_counter() - # TO DO: add plugin config - llm_pipe = openvino_genai.LLMPipeline(str(model_path), device.upper()) + llm_pipe = openvino_genai.LLMPipeline(str(model_path), device.upper(), ov_config) end = time.perf_counter() log.info(f'Pipeline initialization time: {end - start:.2f}s') - streamer = TokenStreamer(llm_pipe.get_tokenizer()) + tokenizer = llm_pipe.get_tokenizer() + streamer = TokenStreamer(tokenizer) - return llm_pipe, hf_tokenizer, end - start, streamer, True + return llm_pipe, tokenizer, end - start, streamer, True def convert_ov_tokenizer(tokenizer_path):