
Commit 0e787d1

Merge remote-tracking branch 'upstream/master' into as/add_cb_ci_tests
2 parents: 7356b2f + 6667c3d

Showing 25 changed files with 318 additions and 333 deletions.

.github/workflows/causal_lm_cpp.yml

Lines changed: 60 additions & 0 deletions
@@ -524,3 +524,63 @@ jobs:
           && export PYTHONPATH=./build/:$PYTHONPATH
           && timeout 50s samples/python/greedy_causal_lm/greedy_causal_lm.py ./redpajama-3b-chat/ "Alan Turing was a"
           | diff ./pred_greedy.txt -
+
+  cpp-chat_sample-ubuntu:
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - uses: actions/setup-python@v4
+        with:
+          python-version: 3.8
+      - name: Install OpenVINO
+        run: |
+          mkdir ./ov/
+          curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
+          sudo ./ov/install_dependencies/install_openvino_dependencies.sh
+      - name: Download, convert and build
+        run: |
+          source ./ov/setupvars.sh
+          python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
+          optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
+          cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
+          cmake --build ./build/ --config Release -j
+      - name: Compare
+        run: |
+          source ./ov/setupvars.sh
+          printf 'What is 2 + 2?\nWhat is the previous answer?\nAdd 1 to it.\nSubtract 5 from it.\nWhy is the sun yellow?\nWhat was my first question?\nStop!\n' > ./input.txt
+          timeout 30s ./build/samples/cpp/chat_sample/chat_sample ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred.txt
+          python -c "
+          from transformers import LlamaTokenizer, AutoModelForCausalLM
+          model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
+          tokenizer = LlamaTokenizer.from_pretrained(model_id)
+          model = AutoModelForCausalLM.from_pretrained(model_id)
+          prompts = ['What is 2 + 2?', 'What is the previous answer?', 'Add 1 to it.', 'Subtract 5 from it.', 'Why is the sun yellow?', 'What was my first question?']
+          def gen_prompt(prompt):
+              return {'role': 'user', 'content': prompt}
+          def gen_answer(answer):
+              return {'role': 'assistant', 'content': answer}
+          chat_history = []
+          chat_prompt = ''
+          output = open('ref.txt', 'w')
+          for prompt in prompts:
+              output.write('question:\n')
+              chat_history.append(gen_prompt(prompt))
+              chat_prompt = tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)
+              tokenized = tokenizer(chat_prompt, return_tensors='pt')
+              answer = model.generate(**tokenized, max_length=1000, do_sample=False)
+              answer_str = tokenizer.decode(answer[0, tokenized['input_ids'].numel():], skip_special_tokens=True)
+              chat_history.append(gen_answer(answer_str))
+              output.write(answer_str)
+              output.write('\n----------\n')
+          output.write('question:\n')
+          output.close()
+          "
+          diff pred.txt ref.txt
+          echo "Chat sample cpp" passed
+          export PYTHONPATH=./build/:$PYTHONPATH
+          timeout 30s ./samples/python/chat_sample/chat_sample.py ./TinyLlama-1.1B-Chat-v1.0/ < input.txt > ./pred2.txt
+          diff pred2.txt ref.txt
+          echo "Chat sample python" passed

llm_bench/python/convert.py

Lines changed: 5 additions & 0 deletions
@@ -1449,6 +1449,11 @@ def main():
         action="store_true",
         help="Apply AWQ algorithm during compression",
     )
+    compression_group.add_argument(
+        "--scale_estimation",
+        action="store_true",
+        help="Apply scale estimation algorithm during compression",
+    )
     add_stateful_model_arguments(parser)

     args = parser.parse_args()

llm_bench/python/utils/conversion_utils/helpers.py

Lines changed: 4 additions & 0 deletions
@@ -160,10 +160,14 @@ def get_data_aware_args(ov_model, tokenizer, config, compression_args, args):
             res['mode'] = dataset_args['sensitivity_metric']
         if 'awq' in dataset_args:
             res['awq'] = dataset_args['awq']
+        if 'scale_estimation' in dataset_args:
+            res['scale_estimation'] = dataset_args['scale_estimation']
     elif args.dataset is not None:
         dataset_params = args.dataset
         if args.awq:
             res['awq'] = args.awq
+        if args.scale_estimation:
+            res['scale_estimation'] = args.scale_estimation

     if dataset_params is not None:
         # for example "wikitext,wikitext-2-v1,train[:1000],text"

samples/cpp/accuracy_sample/accuracy_sample.cpp

Lines changed: 5 additions & 5 deletions
@@ -51,14 +51,14 @@ int main(int argc, char* argv[]) try {
         "What is OpenVINO?",
     };

-    std::vector<GenerationConfig> sampling_params_examples {
-        GenerationConfig::beam_search(),
-        GenerationConfig::greedy(),
-        GenerationConfig::multinomial(),
+    std::vector<ov::genai::GenerationConfig> sampling_params_examples {
+        ov::genai::beam_search(),
+        ov::genai::greedy(),
+        ov::genai::multinomial(),
     };

     std::vector<std::string> prompts(num_prompts);
-    std::vector<GenerationConfig> sampling_params(num_prompts);
+    std::vector<ov::genai::GenerationConfig> sampling_params(num_prompts);

     for (size_t request_id = 0; request_id < num_prompts; ++request_id) {
         prompts[request_id] = prompt_examples[request_id % prompt_examples.size()];

samples/cpp/throughput_benchmark/throughput_benchmark.cpp

Lines changed: 3 additions & 3 deletions
@@ -37,7 +37,7 @@ class AutoStartTimer {

 struct Dataset {
     std::vector<std::string> m_prompts;
-    std::vector<GenerationConfig> m_sampling_params;
+    std::vector<ov::genai::GenerationConfig> m_sampling_params;
     std::vector<size_t> m_input_lens, m_output_lens;

     size_t m_total_input_len = 0;
@@ -50,7 +50,7 @@ struct Dataset {
         m_output_lens.reserve(size);
     }

-    void push_data(std::string prompt, GenerationConfig sampling_params) {
+    void push_data(std::string prompt, ov::genai::GenerationConfig sampling_params) {
         m_prompts.push_back(prompt);
         m_sampling_params.push_back(sampling_params);
     }
@@ -121,7 +121,7 @@ Dataset filtered_dataset(const std::string& models_path, const std::string& data
         if (input_len > max_input_len || (input_len + output_len) > 2048)
             continue;

-        GenerationConfig greedy_search = GenerationConfig::greedy();
+        ov::genai::GenerationConfig greedy_search = ov::genai::greedy();
         greedy_search.max_new_tokens = std::min(max_output_len, output_len);

         dataset.push_data(human_question, greedy_search);

src/cpp/continuous_batching/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
@@ -28,7 +28,6 @@ find_file(spda_to_pa_header sdpa_to_paged_attention.hpp
 set(TARGET_NAME openvino_continuous_batching)

 add_library(${TARGET_NAME} STATIC
-    src/generation_config.cpp
     src/generation_handle.cpp
     src/continuous_batching_pipeline.cpp
     src/paged_attention_transformations.cpp)

src/cpp/continuous_batching/include/continuous_batching_pipeline.hpp

Lines changed: 4 additions & 4 deletions
@@ -8,7 +8,7 @@

 #include "scheduler_config.hpp"
 #include "openvino/genai/tokenizer.hpp"
-#include "generation_config.hpp"
+#include "openvino/genai/generation_config.hpp"
 #include "generation_handle.hpp"

 struct PipelineMetrics {
@@ -32,16 +32,16 @@ class ContinuousBatchingPipeline {

     std::shared_ptr<ov::genai::Tokenizer> get_tokenizer();

-    GenerationConfig get_config() const;
+    ov::genai::GenerationConfig get_config() const;

     PipelineMetrics get_metrics() const;

-    GenerationHandle add_request(uint64_t request_id, std::string prompt, GenerationConfig sampling_params);
+    GenerationHandle add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params);

     void step();

     bool has_non_finished_requests();

     // more high level interface, which can process multiple prompts in continuous batching manner
-    std::vector<GenerationResult> generate(const std::vector<std::string>& prompts, std::vector<GenerationConfig> sampling_params);
+    std::vector<GenerationResult> generate(const std::vector<std::string>& prompts, std::vector<ov::genai::GenerationConfig> sampling_params);
 };
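
The namespaced config type now threads through the whole public interface above. A minimal usage sketch of the batched path, assuming a ContinuousBatchingPipeline instance has already been constructed (its constructor is outside this diff); the prompts and the free-function configs mirror the accuracy_sample changes:

#include <string>
#include <vector>

#include "continuous_batching_pipeline.hpp"
#include "openvino/genai/generation_config.hpp"

// Sketch only: batch-generate two prompts with the namespaced config type.
std::vector<GenerationResult> run_batch(ContinuousBatchingPipeline& pipe) {
    std::vector<std::string> prompts = {"What is OpenVINO?", "Why is the sun yellow?"};

    // Free functions replace the removed GenerationConfig static factories.
    std::vector<ov::genai::GenerationConfig> sampling_params = {
        ov::genai::greedy(),
        ov::genai::beam_search(),
    };

    // One config per prompt; generate() asserts the two sizes match.
    return pipe.generate(prompts, sampling_params);
}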

src/cpp/continuous_batching/include/generation_config.hpp

Lines changed: 0 additions & 78 deletions
This file was deleted.

src/cpp/continuous_batching/include/generation_handle.hpp

Lines changed: 3 additions & 3 deletions
@@ -6,7 +6,7 @@
 #include <memory>
 #include <unordered_map>

-#include "generation_config.hpp"
+#include "openvino/genai/generation_config.hpp"


 enum class GenerationStatus {
@@ -42,10 +42,10 @@ class GenerationStream;

 class GenerationHandleImpl {
     std::shared_ptr<GenerationStream> m_generation_stream;
-    GenerationConfig m_sampling_params;
+    ov::genai::GenerationConfig m_sampling_params;

 public:
-    GenerationHandleImpl(std::shared_ptr<GenerationStream> generation_stream, const GenerationConfig& sampling_params) :
+    GenerationHandleImpl(std::shared_ptr<GenerationStream> generation_stream, const ov::genai::GenerationConfig& sampling_params) :
         m_generation_stream(generation_stream),
         m_sampling_params(sampling_params) {};


src/cpp/continuous_batching/src/continuous_batching_pipeline.cpp

Lines changed: 7 additions & 7 deletions
@@ -26,7 +26,7 @@ class ContinuousBatchingPipeline::Impl {

     // TODO (mzegla): GenerationConfig is request specific object
     // and pipeline only uses default rng_seed.
-    GenerationConfig m_generation_config;
+    ov::genai::GenerationConfig m_generation_config;

     PipelineMetrics m_pipeline_metrics;

@@ -103,7 +103,7 @@ class ContinuousBatchingPipeline::Impl {
         // read default generation config
     }

-    GenerationConfig get_config() const {
+    ov::genai::GenerationConfig get_config() const {
         return m_generation_config;
     }

@@ -115,7 +115,7 @@ class ContinuousBatchingPipeline::Impl {
         return m_tokenizer;
     }

-    GenerationHandle add_request(uint64_t request_id, std::string prompt, GenerationConfig sampling_params) {
+    GenerationHandle add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params) {
         sampling_params.set_eos_token_id(m_tokenizer->get_eos_token_id());
         sampling_params.validate();

@@ -233,7 +233,7 @@ class ContinuousBatchingPipeline::Impl {
         return !m_awaiting_requests.empty() || !m_requests.empty();
     }

-    std::vector<GenerationResult> generate(const std::vector<std::string> prompts, std::vector<GenerationConfig> sampling_params) {
+    std::vector<GenerationResult> generate(const std::vector<std::string> prompts, std::vector<ov::genai::GenerationConfig> sampling_params) {
         OPENVINO_ASSERT(!has_non_finished_requests(), "Generate cannot be called while ContinuousBatchingPipeline is already in running state. Use ContinuousBatchingPipeline::add_request");
         OPENVINO_ASSERT(prompts.size() == sampling_params.size());

@@ -285,15 +285,15 @@ std::shared_ptr<ov::genai::Tokenizer> ContinuousBatchingPipeline::get_tokenizer(
     return m_impl->get_tokenizer();
 }

-GenerationConfig ContinuousBatchingPipeline::get_config() const{
+ov::genai::GenerationConfig ContinuousBatchingPipeline::get_config() const{
     return m_impl->get_config();
 }

 PipelineMetrics ContinuousBatchingPipeline::get_metrics() const{
     return m_impl->get_metrics();
 }

-GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, std::string prompt, GenerationConfig sampling_params) {
+GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, std::string prompt, ov::genai::GenerationConfig sampling_params) {
     return m_impl->add_request(request_id, prompt, sampling_params);
 }

@@ -305,6 +305,6 @@ bool ContinuousBatchingPipeline::has_non_finished_requests() {
     return m_impl->has_non_finished_requests();
 }

-std::vector<GenerationResult> ContinuousBatchingPipeline::generate(const std::vector<std::string>& prompts, std::vector<GenerationConfig> sampling_params) {
+std::vector<GenerationResult> ContinuousBatchingPipeline::generate(const std::vector<std::string>& prompts, std::vector<ov::genai::GenerationConfig> sampling_params) {
    return m_impl->generate(prompts, sampling_params);
 }
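
The request-level path takes the same type through add_request(). A sketch, with the request id chosen arbitrarily and result readback omitted because the GenerationHandle accessors are not part of this diff:

#include <string>

#include "continuous_batching_pipeline.hpp"
#include "openvino/genai/generation_config.hpp"

// Sketch only: enqueue one request and pump the scheduler until it is idle.
void run_single_request(ContinuousBatchingPipeline& pipe, const std::string& prompt) {
    ov::genai::GenerationConfig config = ov::genai::greedy();
    config.max_new_tokens = 64;  // same field the throughput benchmark sets above

    auto handle = pipe.add_request(/*request_id=*/0, prompt, config);

    // step() advances all scheduled requests; loop until none are left.
    while (pipe.has_non_finished_requests()) {
        pipe.step();
    }
    // Results would be read back through `handle` (its API is outside this diff).
}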
