Skip to content

Commit

Permalink
Refactor and add docs
Browse files Browse the repository at this point in the history
  • Loading branch information
vshampor committed Nov 12, 2024
1 parent f5aef77 commit 5b61636
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 95 deletions.
70 changes: 70 additions & 0 deletions src/cpp/src/cache_eviction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,4 +267,74 @@ namespace ov::genai {
m_scores[decoder_layer_idx] = new_scores;
m_cache_counter[decoder_layer_idx] = new_counter;
}

CacheRotationCalculator::CacheRotationCalculator(size_t block_size,
                                                 size_t max_context_length,
                                                 size_t kv_head_size,
                                                 double rope_theta)
    : m_block_size(block_size),
      m_head_size(kv_head_size) {
    // Frequencies follow the original recipe from RoFormer:
    // https://arxiv.org/pdf/2104.09864v5
    //
    // However, the way the rotation coefficients are ultimately applied in Llama and related models from huggingface
    // is very different from the RoFormer - the embedding-dimension coefficients are not treated as consecutive
    // x-y coordinate pairs, but are rather divided into contiguous x-like and y-like halves - see `rotate_half`
    // function in HF transformers. It can be shown that this form still preserves the relative positioning property
    // from the RoFormer article.
    OPENVINO_ASSERT(rope_theta > 0, "rope_theta must be positive");
    size_t max_position_angle_multiplier = max_context_length;
    size_t num_freqs = kv_head_size / 2;

    // The per-frequency base angle theta^(-2j/d) does not depend on the position, so compute it once per frequency
    // instead of re-evaluating std::pow (a transcendental call) max_context_length * num_freqs times in the loop
    // below.
    std::vector<double> base_angles(num_freqs);
    for (size_t j = 0; j < num_freqs; j++) {
        double exponent = -static_cast<double>(2 * j) / kv_head_size;
        base_angles[j] = std::pow(rope_theta, exponent);
    }

    m_rope_sin_lut.resize(max_position_angle_multiplier);
    m_rope_cos_lut.resize(max_position_angle_multiplier);

    // LUT rows are indexed by the position delta (in tokens); row i holds the num_freqs sine/cosine coefficients
    // for rotating by i positions.
    for (size_t i = 0; i < max_position_angle_multiplier; i++) {
        m_rope_sin_lut[i].reserve(num_freqs);
        m_rope_cos_lut[i].reserve(num_freqs);
        for (size_t j = 0; j < num_freqs; j++) {
            double angle = i * base_angles[j];
            m_rope_sin_lut[i].push_back(-std::sin(angle));  // minus since we will be rotating by an inverse angle
            m_rope_cos_lut[i].push_back(std::cos(angle));
        }
    }
}

std::vector<CacheRotationCalculator::BlockRotationData> CacheRotationCalculator::get_rotation_coefficients(
    const std::set<size_t>& evicted_block_logical_indices,
    size_t num_logical_blocks_before_eviction) {
    // The LUTs were sized to max_context_length positions in the constructor; the rotation deltas indexed below are
    // strictly smaller than the pre-eviction token count, so a sequence may occupy up to (and including)
    // max_context_length tokens before eviction. Using `<=` here (rather than `<`) avoids rejecting the legal case
    // where the sequence exactly fills the maximum context.
    OPENVINO_ASSERT(num_logical_blocks_before_eviction * m_block_size <= m_rope_sin_lut.size(),
                    "num_logical_blocks_before_eviction may not correspond to more tokens than max_context_length");

    std::vector<BlockRotationData> retval;
    if (evicted_block_logical_indices.empty()) {
        return retval;
    }

    for (auto idx : evicted_block_logical_indices) {
        OPENVINO_ASSERT(idx < num_logical_blocks_before_eviction);
    }

    // num_logical_blocks_before_eviction > evicted_block_logical_indices.size() is automatically guaranteed by the
    // set property and the previous assertion
    retval.reserve(num_logical_blocks_before_eviction - evicted_block_logical_indices.size());

    // Walk the pre-eviction logical block space in order. Every surviving block is shifted down by the number of
    // evicted blocks that precede it, and must therefore be rotated back by that many block-widths of positions.
    ptrdiff_t current_rotation_delta_in_blocks = 0;
    for (size_t logical_block_idx = 0; logical_block_idx < num_logical_blocks_before_eviction; logical_block_idx++) {
        if (evicted_block_logical_indices.find(logical_block_idx) != evicted_block_logical_indices.end()) {
            current_rotation_delta_in_blocks += 1;
        } else if (current_rotation_delta_in_blocks != 0) {
            BlockRotationData block_rotation_data;
            block_rotation_data.logical_block_idx = logical_block_idx - current_rotation_delta_in_blocks;

            // All tokens within a block are shifted by the same position delta, so each of the m_block_size token
            // slots reuses the same precomputed LUT row.
            size_t delta_in_positions = current_rotation_delta_in_blocks * m_block_size;
            block_rotation_data.cosines.assign(m_block_size, m_rope_cos_lut[delta_in_positions]);
            block_rotation_data.sines.assign(m_block_size, m_rope_sin_lut[delta_in_positions]);

            retval.push_back(std::move(block_rotation_data));
        }
    }

    return retval;
}
}
113 changes: 42 additions & 71 deletions src/cpp/src/cache_eviction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,88 +117,59 @@ class CacheEvictionAlgorithm {
std::vector<std::vector<size_t>> m_cache_counter;
};

/**
* @brief Computes, based on the logical indices of the blocks to be evicted, the rotation coefficients for the
* remaining cache blocks.
*
* The rotation assumes that the executed model applies rotary positional embedding (RoPE) during the execution of
* the attention operation. Each cache block therefore has the RoPE values already "baked in", with positions equivalent
* to the point in time when the cache block values were originally computed in one of the previous attention operations.
* When blocks are evicted, the logical index space of the remaining blocks is in general no longer contiguous with respect to
* the effective positions of tokens in the blocks. Cache rotation allows to remedy this by effectively adjusting the RoPE positions
* of certain blocks in the cache after eviction, by additionally "rotating" them (in the same sense as in RoPE) by such angles that
* the cache blocks in the logical index space are again contiguous in terms of the RoPE positions. This is supposed to make the
* eviction process less impactful on the accuracy of the generation.
*
* Currently only the basic RoPE method is supported (as applied in the Llama original models). Each model in general may have
* its own RoPE method (e.g. non-linear/NTK frequency scaling), and ideally the cache rotation calculator should be adjusted based on
* the specifics of the RoPE defined by the LLM.
*/
class CacheRotationCalculator {
public:
CacheRotationCalculator(size_t block_size, size_t max_context_length, size_t kv_head_size, double rope_theta = 10000.0f) : m_block_size(block_size), m_head_size(kv_head_size) {
// Frequencies follow the original recipe from RoFormer:
// https://arxiv.org/pdf/2104.09864v5
//
// However, the way the rotation coefficients are ultimately applied in Llama and related models from huggingface is very different
// from the RoFormer - the embedding-dimension coefficients are not treated as consecutive x-y coordinate pairs, but are rather
// divided into contiguous x-like and y-like halves - see `rotate_half` function in HF transformers. It can be shown that this form
// still preserves the relative positioning property from the RoFormer article.
OPENVINO_ASSERT(rope_theta > 0, "rope_theta must be positive");
size_t max_position_angle_multiplier = max_context_length;
size_t num_freqs = kv_head_size / 2;
m_rope_sin_lut.resize(max_position_angle_multiplier);
m_rope_cos_lut.resize(max_position_angle_multiplier);

for (size_t i = 0; i < max_position_angle_multiplier; i++) {
m_rope_sin_lut[i].reserve(num_freqs);
m_rope_cos_lut[i].reserve(num_freqs);
for (size_t j = 0; j < num_freqs; j++) {
double exponent = - static_cast<double>(2 * j) / kv_head_size;
double base_angle = std::pow(rope_theta, exponent);
m_rope_sin_lut[i].push_back(-std::sin(i * base_angle)); // minus since we will be rotating by an inverse angle
m_rope_cos_lut[i].push_back(std::cos(i * base_angle));
}
}
};
/**
* Constructs a CacheRotationCalculator.
* @param block_size Block size of the KV cache to evict from.
* @param max_context_length Maximum length possible for a sequence in the current pipeline.
* @param kv_head_size The size (in elements) of the embedding dimension in the attention operation.
* @param rope_theta The base RoPE angle used in the original LLM.
*/
CacheRotationCalculator(size_t block_size, size_t max_context_length, size_t kv_head_size, double rope_theta = 10000.0f);

using RotationCoefficientsPerToken = std::vector<std::vector<double>>; // dimensions: [BLOCK_SIZE, head_size / 2]

/**
* Basic output structure for the calculator.
*/
struct BlockRotationData {
bool operator==(const BlockRotationData& rhs) const {
return (logical_block_idx == rhs.logical_block_idx) && (sines == rhs.sines) && (cosines == rhs.cosines);
}
size_t logical_block_idx; // **NOTE**: corresponds to logical index AFTER eviction
RotationCoefficientsPerToken sines;
RotationCoefficientsPerToken cosines;
size_t logical_block_idx; /** Logical index of the block AFTER eviction to which the sine and cosine coefficients should be applied */
RotationCoefficientsPerToken sines; /** The sine coefficients to be applied to this block's contents for rotation, in order of the block's elements */
RotationCoefficientsPerToken cosines; /** The cosine coefficients to be applied to this block's contents for rotation, in order of the block's elements */
};

std::vector<BlockRotationData> get_rotation_multipliers(const std::set<size_t>& evicted_block_logical_indices, size_t num_logical_blocks_before_eviction) {
OPENVINO_ASSERT(num_logical_blocks_before_eviction * m_block_size < m_rope_sin_lut.size(),
"num_logical_blocks_before_eviction may not correspond to less tokens than max_context_length");

std::vector<BlockRotationData> retval;
if (evicted_block_logical_indices.empty()) {
return retval;
}

for (auto idx : evicted_block_logical_indices) {
OPENVINO_ASSERT(idx < num_logical_blocks_before_eviction);
}

// num_logical_blocks_before_eviction > evicted_block_logical_indices.size() is automatically guaranteed by the
// set property and the previous assertion
retval.reserve(num_logical_blocks_before_eviction - evicted_block_logical_indices.size());

ptrdiff_t current_rotation_delta_in_blocks = 0;
std::vector<size_t> logical_block_space(num_logical_blocks_before_eviction);
std::iota(logical_block_space.begin(), logical_block_space.end(), 0);

for (size_t logical_block_idx : logical_block_space) {
if (evicted_block_logical_indices.find(logical_block_idx) != evicted_block_logical_indices.end()) {
current_rotation_delta_in_blocks += 1;
}
else {
if (current_rotation_delta_in_blocks != 0) {
BlockRotationData block_rotation_data;
block_rotation_data.logical_block_idx = logical_block_idx - current_rotation_delta_in_blocks;
block_rotation_data.cosines.reserve(m_block_size);
block_rotation_data.sines.reserve(m_block_size);
for (size_t i = 0; i < m_block_size; i++) {
block_rotation_data.cosines.push_back(m_rope_cos_lut[current_rotation_delta_in_blocks * m_block_size]);
block_rotation_data.sines.push_back(m_rope_sin_lut[current_rotation_delta_in_blocks * m_block_size]);
}

retval.push_back(block_rotation_data);
}
}
}

return retval;
}
/**
* Computes the rotation coefficients for the given state of the logical block space when eviction is about to take place.
* @param evicted_block_logical_indices The logical block indices that the prior cache eviction algorithm step determined to be necessary to evict.
* @param num_logical_blocks_before_eviction Number of logical blocks that the evicted-from sequence occupied before the eviction step.
* @return A vector of per-block rotation data, including the indices of blocks after eviction that should be rotated, and the pre-computed trigonometric coefficients necessary for rotation.
*/
std::vector<BlockRotationData> get_rotation_coefficients(const std::set<size_t>& evicted_block_logical_indices, size_t num_logical_blocks_before_eviction);

/**
 * @return The size (in elements) of the attention embedding dimension that this CacheRotationCalculator was
 * initialized with (the `kv_head_size` constructor argument).
 */
size_t get_head_size() const {
    return m_head_size;
}
Expand Down
26 changes: 14 additions & 12 deletions src/cpp/src/continuous_batching_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,27 +74,27 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
// and finally create model runner
bool is_use_cache_eviction = m_scheduler->get_config().use_cache_eviction;
if (is_use_cache_eviction) {
m_model_runner = std::make_shared<ModelRunner>(infer_request, updated_config, device_config.get_num_layers(),
/* m_collect_attention_scores = */ true);
m_model_runner = std::make_shared<ModelRunner>(infer_request, m_scheduler->get_block_size(), device_config.get_num_layers(),
/* collect_attention_scores = */ true,
/* is_use_per_layer_cache_control = */ true);
m_rotation_coefficient_stores.reserve(device_config.get_num_layers());
ov::Shape rotation_coefficient_store_shape{ device_config.get_head_size() * (scheduler_config.block_size * scheduler_config.num_kv_blocks) };
ov::Shape rotation_coefficient_store_shape{ device_config.get_head_size() * (m_scheduler->get_block_size() * scheduler_config.num_kv_blocks) };
for (size_t i = 0; i < device_config.get_num_layers(); i++) {
ov::Tensor store(ov::element::f32, rotation_coefficient_store_shape);
std::memset(store.data(), 0, store.get_byte_size());
m_rotation_coefficient_stores.push_back(store);
}
m_next_step_rotation_coefficients.resize(device_config.get_num_layers());
m_next_step_rotated_block_logical_indices_per_sequence.resize(device_config.get_num_layers());
m_cache_rotation_calculator = std::make_shared<CacheRotationCalculator>(scheduler_config.block_size,
m_cache_rotation_calculator = std::make_shared<CacheRotationCalculator>(m_scheduler->get_block_size(),
// TODO (vshampor): LUT size equal to max cache size in tokens
// is overkill - find a way to pass the max sequence length instead
scheduler_config.block_size * scheduler_config.num_kv_blocks,
m_scheduler->get_block_size() * scheduler_config.num_kv_blocks,
device_config.get_head_size());
} else {
m_model_runner = std::make_shared<ModelRunner>(infer_request, updated_config, device_config.get_num_layers());
m_model_runner = std::make_shared<ModelRunner>(infer_request, m_scheduler->get_block_size(), device_config.get_num_layers());
}

m_model_runner = std::make_shared<ModelRunner>(infer_request, m_scheduler->get_block_size(), device_config.get_num_layers(), is_use_cache_eviction);
m_sampler = std::make_shared<Sampler>(m_tokenizer);
m_sampler->set_seed(m_generation_config.rng_seed);

Expand Down Expand Up @@ -218,7 +218,8 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
// evict unimportant blocks from KV cache, if requested
if (sched_config.use_cache_eviction) {
maybe_evict_cache_blocks(sched_config);
m_model_runner->set_cache_rotation_data(m_next_step_rotation_coefficients, m_next_step_rotated_block_logical_indices_per_sequence);
m_model_runner->set_cache_rotation_data(std::move(m_next_step_rotation_coefficients),
std::move(m_next_step_rotated_block_logical_indices_per_sequence));
}

#ifdef DEBUG_CACHE_STATE_DUMP
Expand Down Expand Up @@ -407,6 +408,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::maybe_evict_cache_block
std::vector<size_t> num_blocks_to_rotate_for_each_layer(num_decoder_layers, 0);
size_t head_size = m_cache_rotation_calculator->get_head_size();

// necessary since we move from these members during previous steps
m_next_step_rotation_coefficients.clear();
m_next_step_rotated_block_logical_indices_per_sequence.clear();
m_next_step_rotated_block_logical_indices_per_sequence.resize(num_decoder_layers);
Expand All @@ -429,19 +431,19 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::maybe_evict_cache_block
}
size_t num_blocks_before_eviction = m_scheduler->get_block_tables(seq_id)[layer_idx].size();
auto rotation_multipliers =
m_cache_rotation_calculator->get_rotation_multipliers(logical_blocks_to_evict[layer_idx],
m_cache_rotation_calculator->get_rotation_coefficients(logical_blocks_to_evict[layer_idx],
num_blocks_before_eviction);
for (size_t i = 0; i < rotation_multipliers.size(); i++) {
const auto& block_rotation_data = rotation_multipliers[i];
const auto& rotation_multipliers_cos = block_rotation_data.cosines;
const auto& rotation_multipliers_sin = block_rotation_data.sines;
OPENVINO_ASSERT(rotation_multipliers_cos.size() == rotation_multipliers_sin.size());
OPENVINO_ASSERT(rotation_multipliers_cos.size() == sched_config.block_size);
OPENVINO_ASSERT(rotation_multipliers_cos.size() == m_scheduler->get_block_size());

m_next_step_rotated_block_logical_indices_per_sequence[layer_idx][seq_id].push_back(block_rotation_data.logical_block_idx);

// Fill the store tensor with rotation coefficient data - cos and sin coefficients are each contiguous, cos goes first
size_t block_offset = num_blocks_to_rotate_for_each_layer[layer_idx] * sched_config.block_size * head_size;
size_t block_offset = num_blocks_to_rotate_for_each_layer[layer_idx] * m_scheduler->get_block_size() * head_size;
auto rotation_multipliers_tensor_data = m_rotation_coefficient_stores[layer_idx].data<float>() + block_offset;
for (size_t tok_idx = 0; tok_idx < rotation_multipliers_cos.size(); tok_idx++) {
size_t position_offset = head_size * tok_idx;
Expand Down Expand Up @@ -471,7 +473,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::maybe_evict_cache_block

// Select the previously filled rotation coefficients from the store tensor
for (size_t i = 0; i < num_decoder_layers; i++) {
m_next_step_rotation_coefficients.emplace_back(m_rotation_coefficient_stores[i], ov::Coordinate{0}, ov::Coordinate{num_blocks_to_rotate_for_each_layer[i] * sched_config.block_size * head_size});
m_next_step_rotation_coefficients.emplace_back(m_rotation_coefficient_stores[i], ov::Coordinate{0}, ov::Coordinate{num_blocks_to_rotate_for_each_layer[i] * m_scheduler->get_block_size() * head_size});
}

for (const auto& seq_group_ptr_and_num_blocks_evicted : seq_group_to_num_blocks_evicted_map) {
Expand Down
Loading

0 comments on commit 5b61636

Please sign in to comment.