Commit

device config and external token/detok for genai llm bench (#514)
eaidova authored Jun 14, 2024
1 parent acae7d7 commit e17b09a
Showing 2 changed files with 12 additions and 20 deletions.
15 changes: 7 additions & 8 deletions llm_bench/python/benchmark.py

@@ -188,12 +188,12 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
     for bs_index, in_text in enumerate(input_text_list):
         utils.output_file.output_input_text(in_text, args, model_precision, prompt_index, bs_index, proc_id)
     tok_encode_start = time.perf_counter()
-    input_data = tokenizer(input_text_list, return_tensors='pt')
+    input_data = tokenizer.encode(input_text_list)
     tok_encode_end = time.perf_counter()
     tok_encode_time = (tok_encode_end - tok_encode_start) * 1000
     # Remove `token_type_ids` from inputs
-    input_tokens = input_data['input_ids'] if 'input_ids' in input_data else input_data
-    input_token_size = input_tokens[0].numel()
+    input_tokens = input_data.input_ids.data
+    input_token_size = input_tokens[0].size
     if args['batch_size'] > 1:
         out_str = '[warm-up]' if num == 0 else '[{}]'.format(num)
         out_str += " Batch_size={}, ".format(args['batch_size'])
@@ -209,25 +209,24 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
     max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
     streamer.reset()
     start = time.perf_counter()
-    generated_text = model.generate(input_text_list, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], streamer=streamer).texts
+    generated_tokens = model.generate(input_data, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], streamer=streamer).tokens
     end = time.perf_counter()
+    log.info(type(generated_tokens[0]))
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
         mem_consumption.end_collect_momory_consumption()
         max_rss_mem_consumption, max_shared_mem_consumption = mem_consumption.get_max_memory_consumption()
         mem_consumption.clear_max_memory_consumption()
 
     generation_time = end - start
-
-    result = [streamer.get_tokens()]
     tok_decode_start = time.perf_counter()
-    _ = tokenizer.batch_decode(np.array(result, dtype=int))
+    generated_text = tokenizer.decode(generated_tokens)
     tok_decode_end = time.perf_counter()
     tok_decode_time = (tok_decode_end - tok_decode_start) * 1000
     # Only text_gen need to minus length of input_data, because generated_text may include input_text
     num_tokens = 0
     result_md5_list = []
     for bs_idx in range(args['batch_size']):
-        generated_text_len = len(result[bs_idx])
+        generated_text_len = len(generated_tokens[bs_idx])
         num_tokens += generated_text_len
         if generated_text_len > max_gen_tokens:
             log.error('Output token size is over max output token size!')
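
For reference, here is a minimal standalone sketch of the encode/generate/decode path that run_text_generation_genai now follows. The model directory, device, and prompt are placeholders, and the calls mirror the openvino_genai Tokenizer/LLMPipeline usage shown in the diff above rather than the full benchmark setup.

import time
import openvino_genai

model_path = './TinyLlama-1.1B-Chat-ov'   # hypothetical directory with an exported OpenVINO model and tokenizer
pipe = openvino_genai.LLMPipeline(model_path, 'CPU')
tokenizer = pipe.get_tokenizer()

prompts = ['What is OpenVINO?']

# Tokenize with the pipeline's own OpenVINO tokenizer instead of a Hugging Face one.
tok_start = time.perf_counter()
input_data = tokenizer.encode(prompts)
tok_encode_ms = (time.perf_counter() - tok_start) * 1000
input_token_size = input_data.input_ids.data[0].size   # .data exposes the token ids as a numpy array

# Generate from token ids and keep token ids back, so detokenization time can be measured separately.
generated_tokens = pipe.generate(input_data, max_new_tokens=64, num_beams=1).tokens

tok_start = time.perf_counter()
generated_text = tokenizer.decode(generated_tokens)
tok_decode_ms = (time.perf_counter() - tok_start) * 1000

print(generated_text[0])
print(f'input tokens: {input_token_size}, encode: {tok_encode_ms:.2f} ms, decode: {tok_decode_ms:.2f} ms')
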
17 changes: 5 additions & 12 deletions llm_bench/python/utils/ov_utils.py

@@ -181,11 +181,10 @@ def create_text_gen_model(model_path, device, **kwargs):
 def create_genai_text_gen_model(model_path, device, ov_config, **kwargs):
     import openvino_tokenizers # noqa: F401
     import openvino_genai
-    from transformers import AutoTokenizer
 
     class TokenStreamer(openvino_genai.StreamerBase):
         def __init__(self, tokenizer):
-            super().__init__()
+            openvino_genai.StreamerBase.__init__(self)
             self.tokenizer = tokenizer
             self.token_generation_time = []
             self.generated_tokens = []
@@ -214,21 +213,15 @@ def get_time_list(self):
     if not (model_path / "openvino_tokenizer.xml").exists() or not (model_path / "openvino_detokenizer.xml").exists():
         convert_ov_tokenizer(model_path)
 
-    core = Core()
-    hf_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    ov_tok = core.read_model(model_path / "openvino_tokenizer.xml")
-    ov_detok = core.read_model(model_path / "openvino_detokenizer.xml")
-    hf_tokenizer = build_ov_tokenizer_wrapper(hf_tokenizer, ov_tok, ov_detok)
-
     start = time.perf_counter()
 
-    # TO DO: add plugin config
-    llm_pipe = openvino_genai.LLMPipeline(str(model_path), device.upper())
+    llm_pipe = openvino_genai.LLMPipeline(str(model_path), device.upper(), ov_config)
     end = time.perf_counter()
     log.info(f'Pipeline initialization time: {end - start:.2f}s')
-    streamer = TokenStreamer(llm_pipe.get_tokenizer())
+    tokenizer = llm_pipe.get_tokenizer()
+    streamer = TokenStreamer(tokenizer)
 
-    return llm_pipe, hf_tokenizer, end - start, streamer, True
+    return llm_pipe, tokenizer, end - start, streamer, True
 
 
 def convert_ov_tokenizer(tokenizer_path):
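
Below is a similar hedged sketch of how the updated create_genai_text_gen_model wiring can be exercised: an LLMPipeline built with a device config dictionary plus a StreamerBase subclass that records per-token timestamps. The ov_config value, model path, and prompt are assumptions for illustration; the streamer only keeps token ids and timings, like the TokenStreamer in the diff.

import time
import openvino_genai

class TokenStreamer(openvino_genai.StreamerBase):
    def __init__(self, tokenizer):
        openvino_genai.StreamerBase.__init__(self)
        self.tokenizer = tokenizer
        self.generated_tokens = []
        self.token_generation_time = []
        self._start = time.perf_counter()

    def put(self, token_id):
        # Called once per generated token; returning False lets generation continue.
        self.generated_tokens.append(token_id)
        self.token_generation_time.append(time.perf_counter() - self._start)
        return False

    def end(self):
        pass

ov_config = {'PERFORMANCE_HINT': 'LATENCY'}    # assumed plugin config; any OpenVINO properties can go here
model_path = './TinyLlama-1.1B-Chat-ov'        # hypothetical exported model directory

llm_pipe = openvino_genai.LLMPipeline(model_path, 'CPU', ov_config)
tokenizer = llm_pipe.get_tokenizer()
streamer = TokenStreamer(tokenizer)

llm_pipe.generate('What is OpenVINO?', max_new_tokens=32, streamer=streamer)
print(f'streamed {len(streamer.generated_tokens)} tokens')
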
