From ffdad03f90b29d2cc28a1004bd7446ddc2d866e2 Mon Sep 17 00:00:00 2001 From: xufang Date: Fri, 3 Jan 2025 14:40:20 +0800 Subject: [PATCH] add perf printing for prompt lookup decoding --- .../prompt_lookup_decoding_lm.cpp | 8 +++++-- .../src/prompt_lookup/prompt_lookup_impl.cpp | 21 +++++++++++++++++++ .../speculative_decoding_impl.cpp | 4 ++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index 8b48dbade0..cc4f8c0973 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -34,8 +34,12 @@ int main(int argc, char* argv[]) try { // Since the streamer is set, the results will // be printed each time a new token is generated. - pipe.generate(prompt, config, streamer); - std::cout << std::endl; + int iter = 0; + while (iter < 10) { + pipe.generate(prompt, config, streamer); + iter++; + std::cout << "\npipeline finish iter:" << iter << std::endl; + } } catch (const std::exception& error) { try { std::cerr << error.what() << '\n'; diff --git a/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp b/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp index f934a56939..11287df6e6 100644 --- a/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp +++ b/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp @@ -64,6 +64,7 @@ void ContinuousBatchingPipeline::PromptLookupImpl::step() { } if (generated_len_after.empty() && 0) { + m_pipeline->get_infer_duration(m_sd_metrics.main_infer_duration, m_sd_metrics.main_infer_num); m_sd_metrics.print(true); m_sd_metrics.clean_up(); } @@ -103,14 +104,25 @@ ContinuousBatchingPipeline::PromptLookupImpl::generate(const std::vectorreset_infer_duration(); while (has_non_finished_requests() && continue_generation) { + ManualTimer step_timer("speculative_decoding: step()"); + step_timer.start(); step(); + step_timer.end(); + first_token_time += step_timer.get_duration(); if (streamer_ptr) { // not generated tokens like several prompt phase if (!main_generations.at(0).get()->can_read()) { continue; } std::unordered_map token = main_generations.at(0).get()->back(); + if (!get_first_token && !token.begin()->second.generated_ids.empty()) { + first_tokens_num = token.begin()->second.generated_ids.size(); + } OPENVINO_ASSERT(1 <= token.size()); OPENVINO_ASSERT(1 <= token.begin()->second.generated_ids.size()); for (const auto& gen_token : token.begin()->second.generated_ids) { @@ -120,6 +132,12 @@ ContinuousBatchingPipeline::PromptLookupImpl::generate(const std::vector 0) { + get_first_token = true; + m_sd_metrics.first_token_duration = first_token_time; + int number = 0; + m_pipeline->get_infer_duration(m_sd_metrics.main_infer_for_first_token, number); + } } if (streamer_ptr) { streamer_ptr->end(); @@ -148,6 +166,9 @@ ContinuousBatchingPipeline::PromptLookupImpl::generate(const std::vectorget_infer_duration(m_sd_metrics.main_infer_duration, m_sd_metrics.main_infer_num); + m_sd_metrics.print(true); + m_sd_metrics.clean_up(); return results; } diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index a93338f273..a904048b37 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -296,6 +296,10 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector< OPENVINO_ASSERT(results.size() == input_ids.size()); generate_timer.end(); + m_draft_pipeline->get_infer_duration(m_sd_metrics.draft_infer_duration, m_sd_metrics.draft_infer_num); + m_main_pipeline->get_infer_duration(m_sd_metrics.main_infer_duration, m_sd_metrics.main_infer_num); + m_sd_metrics.print(true); + m_sd_metrics.clean_up(); return results; }