Update greedy_causal_lm.cpp to read EOS Token (#315)
*Details:* Made changes to accommodate the dynamic EOS Token.
*Tickets:* #277, 132861
anzr299 authored Apr 9, 2024
1 parent c21b149 commit 72caf05
Showing 4 changed files with 60 additions and 47 deletions.
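Before the per-file diffs, the gist of the change: the samples used to hardcode the EOS token id (`SPECIAL_EOS_TOKEN = 2`) and now read it from the tokenizer model's runtime information. A minimal self-contained sketch of that pattern, distilled from the diffs below; the helper name `read_eos_token_id` is illustrative only, not part of the commit:

```cpp
#include <openvino/openvino.hpp>

#include <cstdint>
#include <stdexcept>
#include <string>

// Illustrative helper (the name is ours, not the commit's): read eos_token_id
// from the runtime info embedded in openvino_tokenizer.xml instead of
// hardcoding it, as this commit does inline in each sample's main().
int64_t read_eos_token_id(ov::Core& core, const std::string& model_dir) {
    auto tokenizer_model = core.read_model(model_dir + "/openvino_tokenizer.xml");
    ov::AnyMap rt_info = tokenizer_model->get_rt_info();
    if (rt_info.count("eos_token_id") == 0) {
        throw std::runtime_error("EOS token ID not found in model's runtime information.");
    }
    return rt_info.at("eos_token_id").as<int64_t>();
}
```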
28 changes: 20 additions & 8 deletions text_generation/causal_lm/cpp/beam_search_causal_lm.cpp
@@ -22,7 +22,7 @@ std::string detokenize(ov::InferRequest& detokenizer, const std::vector<int64_t>
detokenizer.infer();
return detokenizer.get_output_tensor().data<std::string>()[0];
}
-} // namespace
+}

int main(int argc, char* argv[]) try {
if (argc != 3) {
@@ -31,15 +31,17 @@ int main(int argc, char* argv[]) try {
// Compile models
ov::Core core;
core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
+//Read the tokenizer model information from the file to later get the runtime information
+auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml");
// tokenizer and detokenizer work on CPU only
-ov::InferRequest tokenizer =
-core.compile_model(std::string{argv[1]} + "/openvino_tokenizer.xml", "CPU").create_infer_request();
+ov::InferRequest tokenizer = core.compile_model(
+tokenizer_model, "CPU").create_infer_request();
auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]);
-ov::InferRequest detokenizer =
-core.compile_model(std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request();
+ov::InferRequest detokenizer = core.compile_model(
+std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request();
// The model can be compiled for GPU as well
-ov::InferRequest lm =
-core.compile_model(std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request();
+ov::InferRequest lm = core.compile_model(
+std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request();
// Initialize inputs
lm.set_tensor("input_ids", input_ids);
lm.set_tensor("attention_mask", attention_mask);
@@ -49,8 +51,18 @@ int main(int argc, char* argv[]) try {
lm.get_tensor("beam_idx").set_shape({1});
lm.get_tensor("beam_idx").data<int32_t>()[0] = 0;

+// Get the runtime info from the tokenizer model that we read earlier
+auto rt_info = tokenizer_model->get_rt_info(); //Get the runtime info for the model
+int64_t SPECIAL_EOS_TOKEN;
+
+if (rt_info.count("eos_token_id") > 0) { //check if the runtime information has a valid EOS token ID
+SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as<int64_t>();
+
+} else {
+throw std::runtime_error("EOS token ID not found in model's runtime information.");
+}
const int64_t* prompt_data = input_ids.data<const int64_t>();
-Parameters parameters{std::vector<int64_t>{prompt_data, prompt_data + input_ids.get_size()}};
+Parameters parameters{std::vector<int64_t>{prompt_data, prompt_data + input_ids.get_size()}, SPECIAL_EOS_TOKEN};
GroupBeamSearcher group_beam_searcher{parameters};
std::vector<int64_t> next_tokens;
std::vector<int32_t> next_beams;
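A knock-on effect worth noting: as the group_beam_searcher.hpp hunks below show, `Parameters` now declares `eos_token` with no default value, so aggregate initialization forces every caller to supply one. A condensed sketch of the resulting flow in this file's main(), assuming `tokenizer_model` and `input_ids` are set up as in the diff above:

```cpp
// Condensed from the diff above; assumes tokenizer_model and input_ids exist
// as in the sample. Error handling mirrors the commit.
auto rt_info = tokenizer_model->get_rt_info();
if (rt_info.count("eos_token_id") == 0) {
    throw std::runtime_error("EOS token ID not found in model's runtime information.");
}
int64_t SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as<int64_t>();

// eos_token no longer has a default of 2, so it must be passed explicitly;
// the remaining Parameters fields keep their in-class defaults.
const int64_t* prompt_data = input_ids.data<const int64_t>();
Parameters parameters{std::vector<int64_t>{prompt_data, prompt_data + input_ids.get_size()},
                      SPECIAL_EOS_TOKEN};
```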
18 changes: 14 additions & 4 deletions text_generation/causal_lm/cpp/greedy_causal_lm.cpp
@@ -61,9 +61,11 @@ int main(int argc, char* argv[]) try {
// Compile models
ov::Core core;
core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
+//Read the tokenizer model information from the file to later get the runtime information
+auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml");
// tokenizer and detokenizer work on CPU only
ov::InferRequest tokenizer = core.compile_model(
-std::string{argv[1]} + "/openvino_tokenizer.xml", "CPU").create_infer_request();
+tokenizer_model, "CPU").create_infer_request();
auto [input_ids, attention_mask] = tokenize(tokenizer, argv[2]);
ov::InferRequest detokenizer = core.compile_model(
std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request();
@@ -91,9 +93,17 @@ lm.get_tensor("input_ids").set_shape({BATCH_SIZE, 1});
lm.get_tensor("input_ids").set_shape({BATCH_SIZE, 1});
position_ids.set_shape({BATCH_SIZE, 1});
TextStreamer text_streamer{std::move(detokenizer)};
-// There's no way to extract special token values from the detokenizer for now
-constexpr int64_t SPECIAL_EOS_TOKEN = 2;

+
+// Get the runtime info from the tokenizer model that we read earlier
+auto rt_info = tokenizer_model->get_rt_info(); //Get the runtime info for the model
+int64_t SPECIAL_EOS_TOKEN;
+
+if (rt_info.count("eos_token_id") > 0) { //check if the runtime information has a valid EOS token ID
+SPECIAL_EOS_TOKEN = rt_info["eos_token_id"].as<int64_t>();
+} else {
+throw std::runtime_error("EOS token ID not found in model's runtime information.");
+}

int max_sequence_length = 100;
while (out_token != SPECIAL_EOS_TOKEN && seq_len < max_sequence_length) {
++seq_len;
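Downstream, the dynamically read id is what terminates generation, as the last hunk shows. A schematic of that stopping logic; `generate_next_token()` is a hypothetical stand-in for the sample's lm.infer()-plus-argmax step, and the `seq_len` bookkeeping is simplified relative to the sample:

```cpp
// Schematic only. generate_next_token() is a hypothetical placeholder for the
// sample's lm.infer() followed by an argmax over the last token's logits;
// text_streamer is the TextStreamer declared in the hunk above.
int64_t out_token = generate_next_token(lm);
int seq_len = 0;
int max_sequence_length = 100;
while (out_token != SPECIAL_EOS_TOKEN && seq_len < max_sequence_length) {
    ++seq_len;
    text_streamer.put(out_token);         // stream detokenized text incrementally
    out_token = generate_next_token(lm);  // loop ends as soon as EOS appears
}
text_streamer.end();  // flush any buffered text
```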
59 changes: 25 additions & 34 deletions text_generation/causal_lm/cpp/group_beam_searcher.hpp
@@ -44,10 +44,7 @@ std::vector<int64_t> kmp_search(const std::vector<int64_t>& haystack, const std:
return res;
}

-struct Token {
-float log_prob;
-int64_t idx;
-};
+struct Token {float log_prob; int64_t idx;};

std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx) {
if (logits.get_shape().at(0) <= batch_idx) {
@@ -58,10 +55,10 @@ std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx) {
size_t sequence_offset = (logits.get_shape().at(1) - 1) * vocab_size;
const float* beam_logits = logits.data<const float>() + batch_offset + sequence_offset;
float max_logit = *std::max_element(beam_logits, beam_logits + vocab_size);
-float log_sum = std::log(
-std::accumulate(beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) {
+float log_sum = std::log(std::accumulate(
+beam_logits, beam_logits + vocab_size, 0.0f, [max_logit](float accumulated, float to_add) {
return accumulated + std::exp(to_add - max_logit);
-}));
+}));
std::vector<Token> tokens;
tokens.reserve(vocab_size);
for (size_t idx = 0; idx < vocab_size; ++idx) {
@@ -80,26 +77,24 @@ bool greater(const Beam& left, const Beam& right) {
return left.score > right.score;
}

-enum class StopCriteria { early, heuristic, never };
+enum class StopCriteria {early, heuristic, never};

struct Parameters {
std::vector<int64_t> prompt;
+int64_t eos_token;
size_t n_groups = 3;
size_t group_size = 5;
float diversity_penalty = 1.0;
size_t max_new_tokens = 20;
StopCriteria stop_criteria = StopCriteria::heuristic;
float length_penalty = 1.0;
size_t no_repeat_ngram_size = std::numeric_limits<size_t>::max();
-// There's no way to extract special token values from the tokenizer for now
-int64_t eos_token = 2;
-std::function<bool(const Beam&)> early_finish = [](const Beam&) {
-return false;
-};
+
+std::function<bool(const Beam&)> early_finish = [](const Beam&){return false;};
};

struct Group {
-std::vector<Beam> ongoing; // Best beams in front
+std::vector<Beam> ongoing;  // Best beams in front
std::vector<Beam> min_heap; // The worst of the best completed beams is the first
bool done = false;

@@ -126,30 +121,26 @@ struct Group {
float best_sum_logprobs = ongoing.front().score;
float worst_score = min_heap.front().score;
switch (parameters.stop_criteria) {
-case StopCriteria::early:
-done = true;
-return;
-case StopCriteria::heuristic: {
-float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty);
-done = worst_score >= highest_attainable_score;
-return;
-}
-case StopCriteria::never: {
-size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len;
-float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty);
-done = worst_score >= highest_attainable_score;
-return;
-}
-default:
-throw std::runtime_error("Never reached");
+case StopCriteria::early:
+done = true;
+return;
+case StopCriteria::heuristic: {
+float highest_attainable_score = best_sum_logprobs / std::pow(float(cur_len), parameters.length_penalty);
+done = worst_score >= highest_attainable_score;
+return;
+}
+case StopCriteria::never: {
+size_t length = parameters.length_penalty > 0.0 ? parameters.max_new_tokens : cur_len;
+float highest_attainable_score = best_sum_logprobs / std::pow(float(length), parameters.length_penalty);
+done = worst_score >= highest_attainable_score;
+return;
+}
+default: throw std::runtime_error("Never reached");
}
}
};

-struct TokenToBeam {
-int64_t token_idx;
-int32_t beam_idx;
-};
+struct TokenToBeam {int64_t token_idx; int32_t beam_idx;};

// GroupBeamSearcher processes logits produced by a language model and accumulates beams using group beam search
// algorithm. select_next_tokens() returns token ids selected by the algorithm and corresponding beam ids. These values
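The header comment above describes the intended usage of GroupBeamSearcher. A hedged sketch of that loop, combining the comment with the main() changes earlier in this commit; the `select_next_tokens()` signature, the `"logits"` tensor name, and the empty-result termination check are inferred, not shown in this diff:

```cpp
// Usage sketch inferred from the header comment and main() above; assumes
// prompt_tokens (std::vector<int64_t>), SPECIAL_EOS_TOKEN, and a compiled
// lm infer request exist as in the samples.
Parameters parameters{prompt_tokens, SPECIAL_EOS_TOKEN};  // eos_token is now mandatory
GroupBeamSearcher group_beam_searcher{parameters};
std::vector<int64_t> next_tokens;
std::vector<int32_t> next_beams;
for (size_t step = 0; step < parameters.max_new_tokens; ++step) {
    lm.infer();  // produces logits for every ongoing beam
    std::tie(next_tokens, next_beams) =
        group_beam_searcher.select_next_tokens(lm.get_tensor("logits"));
    if (next_tokens.empty()) {
        break;  // every group finished, e.g. all beams reached eos_token
    }
    // Feed next_tokens back as input_ids and next_beams as beam_idx
    // before the next iteration.
}
```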
