From 481a043d38f59d1b122df999ad965020238a9a37 Mon Sep 17 00:00:00 2001 From: TolyaTalamanov <tolya.talamanov@intel.com> Date: Thu, 2 Jan 2025 14:52:04 +0000 Subject: [PATCH] Revert changes in sampler --- src/cpp/src/llm_pipeline_static.cpp | 8 +- src/cpp/src/sampler.cpp | 131 ++++++++++++++-------------- src/cpp/src/sampler.hpp | 21 ----- 3 files changed, 68 insertions(+), 92 deletions(-) diff --git a/src/cpp/src/llm_pipeline_static.cpp b/src/cpp/src/llm_pipeline_static.cpp index aac55015c0..06dceb64a6 100644 --- a/src/cpp/src/llm_pipeline_static.cpp +++ b/src/cpp/src/llm_pipeline_static.cpp @@ -635,10 +635,10 @@ void copy_columns_by_row_chunks(const ov::Tensor& src, ov::Tensor& dst) { } } -void stream_generated_tokens(std::shared_ptr<StreamerBase> streamer_ptr, - GenerationHandle& handle) { +void stream_generated_tokens(std::shared_ptr<ov::genai::StreamerBase> streamer_ptr, + ov::genai::GenerationHandle& handle) { if (streamer_ptr && handle->can_read()) { - std::unordered_map<uint64_t, GenerationOutput> token = handle->back(); + std::unordered_map<uint64_t, ov::genai::GenerationOutput> token = handle->back(); for (const auto& gen_token : token.begin()->second.generated_ids) { if (streamer_ptr->put(gen_token)) { handle->drop(); } } } } -int64_t get_last_token(SequenceGroup::Ptr sequence_group) { +int64_t get_last_token(ov::genai::SequenceGroup::Ptr sequence_group) { const auto running_sequences = sequence_group->get_running_sequences(); OPENVINO_ASSERT(running_sequences.size() == 1u); const auto sequence = running_sequences.front(); diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 73a406c695..6498a7d4c4 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -67,71 +67,6 @@ std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx) { return tokens; } -Token greedy_sample(const Logits& logits, size_t top_logprobs) { - // For greedy sampling we do not expect sorting or shrinking considered tokens - // so we can operate directly on the data buffer - size_t m = std::max(size_t(1), 
top_logprobs); // ensure m is at least 1 - std::vector<float> top_values(m, -std::numeric_limits<float>::infinity()); - std::vector<size_t> top_indexes(m, 0); - - for (size_t i = 0; i < logits.m_size; ++i) { - if (logits.m_data[i] > top_values.back()) { - top_values.back() = logits.m_data[i]; - top_indexes.back() = i; - - for (size_t j = top_values.size() - 1; j > 0 && top_values[j] > top_values[j - 1]; --j) { - std::swap(top_values[j], top_values[j - 1]); - std::swap(top_indexes[j], top_indexes[j - 1]); - } - } - } - - size_t max_index = top_indexes.front(); - float max_value = 0.0; - - if (top_logprobs) { - // apply log softmax to max value - max_value = top_values.front(); - float log_sum = std::log(std::accumulate( - logits.m_data, logits.m_data + logits.m_size, 0.0f, [max_value](float accumulated, float to_add) { - return accumulated + std::exp(to_add - max_value); - })); - max_value = -log_sum; - } - - return Token(max_value, max_index); -} - -std::vector<Token> multinomial_sample(const Logits& logits, - size_t num_tokens_per_sequence, - std::mt19937& rng_engine) { - // If top_p or top_k was applied we use sorted vector, if not we go with original buffer. - std::vector<float> multinomial_weights; - multinomial_weights.reserve(logits.m_size); - if (logits.is_vector_initialized()) - for (auto& logit: logits.m_vector) multinomial_weights.emplace_back(logit.m_log_prob); - else - multinomial_weights.assign(logits.m_data, logits.m_data + logits.m_size); - - // std::discrete_distribution returns corrupted results when applied to log probabilities - // which result returning NAN only logprobs. 
- // so log() is applied after this line - auto dist = std::discrete_distribution<size_t>(multinomial_weights.begin(), multinomial_weights.end()); // equivalent to multinomial with number of trials == 1 - - std::vector<Token> out_tokens; - for (size_t token_idx = 0; token_idx < num_tokens_per_sequence; ++token_idx) { - size_t element_to_pick = dist(rng_engine); - if (logits.is_vector_initialized()) { - auto logit = logits.m_vector[element_to_pick]; - logit.m_log_prob = std::log(logit.m_log_prob); - out_tokens.push_back(logit); - } - else - out_tokens.emplace_back(std::log(logits.m_data[element_to_pick]), element_to_pick); - } - return out_tokens; -} - std::vector<int64_t> wrap_tokens(const std::vector<int64_t>& tokens, const std::vector<int64_t>& prefix_tokens, const std::vector<int64_t>& suffix_tokens) { std::vector<int64_t> all_tokens = prefix_tokens; all_tokens.insert(all_tokens.end(), tokens.begin(), tokens.end()); @@ -160,6 +95,13 @@ std::vector<int64_t> encode_and_process_string(const std::string& stop_string, o return encoded_stop_string; } +struct MatchStopStringResult { + size_t to_remove = 0; + // int64_t last_token_id = 0; + // bool is_to_update_last_token = false; + bool is_matched = false; +}; + // Return number of last tokens that match one of the stop_strings. If there's no match 0 is returned. 
MatchStopStringResult match_stop_string(Tokenizer& tokenizer, const TokenIds& generated_tokens, @@ -539,11 +481,66 @@ Logits Sampler::_get_logit_vector(ov::Tensor logits, size_t batch_idx, size_t to } Token Sampler::_greedy_sample(const Logits& logits, size_t top_logprobs) const { - return greedy_sample(logits, top_logprobs); + // For greedy sampling we do not expect sorting or shrinking considered tokens + // so we can operate directly on the data buffer + size_t m = std::max(size_t(1), top_logprobs); // ensure m is at least 1 + std::vector<float> top_values(m, -std::numeric_limits<float>::infinity()); + std::vector<size_t> top_indexes(m, 0); + + for (size_t i = 0; i < logits.m_size; ++i) { + if (logits.m_data[i] > top_values.back()) { + top_values.back() = logits.m_data[i]; + top_indexes.back() = i; + + for (size_t j = top_values.size() - 1; j > 0 && top_values[j] > top_values[j - 1]; --j) { + std::swap(top_values[j], top_values[j - 1]); + std::swap(top_indexes[j], top_indexes[j - 1]); + } + } + } + + size_t max_index = top_indexes.front(); + float max_value = 0.0; + + if (top_logprobs) { + // apply log softmax to max value + max_value = top_values.front(); + float log_sum = std::log(std::accumulate( + logits.m_data, logits.m_data + logits.m_size, 0.0f, [max_value](float accumulated, float to_add) { + return accumulated + std::exp(to_add - max_value); + })); + max_value = -log_sum; + } + + return Token(max_value, max_index); } std::vector<Token> Sampler::_multinomial_sample(const Logits& logits, size_t num_tokens_per_sequence) { - return multinomial_sample(logits, num_tokens_per_sequence, rng_engine); + // If top_p or top_k was applied we use sorted vector, if not we go with original buffer. 
+ std::vector<float> multinomial_weights; + multinomial_weights.reserve(logits.m_size); + if (logits.is_vector_initialized()) + for (auto& logit: logits.m_vector) multinomial_weights.emplace_back(logit.m_log_prob); + else + multinomial_weights.assign(logits.m_data, logits.m_data + logits.m_size); + + // std::discrete_distribution returns corrupted results when applied to log probabilities + // which result returning NAN only logprobs. + // so log() is applied after this line + auto dist = std::discrete_distribution<size_t>(multinomial_weights.begin(), multinomial_weights.end()); // equivalent to multinomial with number of trials == 1 + + std::vector<Token> out_tokens; + for (size_t token_idx = 0; token_idx < num_tokens_per_sequence; ++token_idx) { + size_t element_to_pick = dist(rng_engine); + if (logits.is_vector_initialized()) { + auto logit = logits.m_vector[element_to_pick]; + logit.m_log_prob = std::log(logit.m_log_prob); + out_tokens.push_back(logit); + } + else + out_tokens.emplace_back(std::log(logits.m_data[element_to_pick]), element_to_pick); + } + return out_tokens; } std::vector<int64_t> Sampler::_try_finish_generation(SequenceGroup::Ptr & sequence_group) { diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index df0c406749..7796f93d1e 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -32,27 +32,6 @@ inline bool is_stop_token_id_hit(int64_t generated_token, const std::set std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx); -Token greedy_sample(const Logits& logits, size_t top_logprobs); - -std::vector<Token> multinomial_sample(const Logits& logits, - size_t num_tokens_per_sequence, - std::mt19937& rng_engine); - -std::pair<size_t, std::set<std::string>> -process_stop_strings(const std::set<std::string>& stop_strings, Tokenizer& tokenizer); - -struct MatchStopStringResult { - size_t to_remove = 0; - // int64_t last_token_id = 0; - // bool is_to_update_last_token = false; - bool is_matched = false; -}; - -MatchStopStringResult match_stop_string(Tokenizer& tokenizer, - const TokenIds& 
generated_tokens, - const std::pair<size_t, std::set<std::string>>& stop_strings, - bool is_include_to_output); - struct SamplerOutput { // IDs of sequences that need to be dropped std::vector<uint64_t> m_dropped_sequences;