
Commit 78cca0a

[NPU] update llm-npu-cli example (#12729)
* update cli example
* add license
* rename
* update readme sample output
1 parent 7e29edc commit 78cca0a


3 files changed: +26 -42 lines changed


python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/CMakeLists.txt

Lines changed: 4 additions & 4 deletions
@@ -18,20 +18,20 @@ endif()
 add_library(npu_llm STATIC IMPORTED)
 set_target_properties(npu_llm PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/npu_llm.lib)
 
-set(TARGET llm-npu-cli)
-add_executable(${TARGET} llm-npu-cli.cpp)
+set(TARGET llama-cli-npu)
+add_executable(${TARGET} llama-cli-npu.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE npu_llm)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
-add_custom_command(TARGET llm-npu-cli POST_BUILD
+add_custom_command(TARGET llama-cli-npu POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E copy_if_different
         ${LIBRARY_DIR}/npu_llm.dll
         ${CMAKE_BINARY_DIR}/Release/
     COMMENT "Copying npu_llm.dll to build/Release\n"
 )
 
-add_custom_command(TARGET llm-npu-cli POST_BUILD
+add_custom_command(TARGET llama-cli-npu POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E copy_directory
         ${DLL_DIR}/
         ${CMAKE_BINARY_DIR}/Release/

python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/README.md

Lines changed: 16 additions & 17 deletions
@@ -81,9 +81,9 @@ Arguments info:
 - `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `512`.
 - `--low-bit LOW_BIT`: argument defining the low bit optimizations that will be applied to the model. Current available options are `"sym_int4"`, `"asym_int4"` and `"sym_int8"`, with `"sym_int4"` as the default.
 
-## 3. Build C++ Example `llm-npu-cli`
+## 3. Build C++ Example `llama-cli-npu`(Optional)
 
-You can run below cmake script in cmd to build `llm-npu-cli`, don't forget to replace below conda env dir with your own path.
+- You can run below cmake script in cmd to build `llama-cli-npu` by yourself, don't forget to replace below <CONDA_ENV_DIR> with your own path.
 
 ```cmd
 :: under current directory
@@ -96,16 +96,21 @@ cmake --build . --config Release -j
 cd Release
 ```
 
-## 4. Run `llm-npu-cli`
+- You can also directly use our released `llama-cli-npu.exe` which has the same usage as this example `llama-cli-npu.cpp`
 
-With built `llm-npu-cli`, you can run the example with specified paramaters. For example,
+> [!NOTE]
+> Our released `llama-cli-npu.exe` can be found at <CONDA_ENV_DIR>\bigdl-core-npu
+
+## 4. Run `llama-cli-npu`
+
+With built `llama-cli-npu`, you can run the example with specified paramaters. For example,
 
 ```cmd
 # Run simple text completion
-llm-npu-cli.exe -m <converted_model_path> -n 64 "AI是什么?"
+llama-cli-npu.exe -m <converted_model_path> -n 64 "AI是什么?"
 
 # Run in conversation mode
-llm-npu-cli.exe -m <converted_model_path> -cnv
+llama-cli-npu.exe -m <converted_model_path> -cnv
 ```
 
 Arguments info:
@@ -118,18 +123,12 @@ Arguments info:
 #### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
 ##### Text Completion
 ```cmd
-Input:
-<s>[INST] <<SYS>>
-
-<</SYS>>
-
-What is AI? [/INST]
-
-Prefill 26 tokens cost xxxx ms.
+AI stands for Artificial Intelligence, which is the field of study focused on creating and developing intelligent machines that can perform tasks that typically require human intelligence, such as visual and auditory recognition, speech recognition, and decision-making. AI is a broad and diverse field that includes a wide range
 
-Decode 63 tokens cost xxxx ms (avg xxxx ms each token).
-Output:
-AI stands for Artificial Intelligence, which is the field of study focused on creating and developing intelligent machines that can perform tasks that typically require human intelligence, such as visual and auditory recognition, speech recognition, and decision-making. AI is a broad and diverse field that includes a wide range
+llm_perf_print: load time = xxxx.xx ms
+llm_perf_print: prompt eval time = xxxx.xx ms / 26 tokens ( xx.xx ms per token, xx.xx tokens per second)
+llm_perf_print: eval time = xxxx.xx ms / 63 runs ( xx.xx ms per token, xx.xx tokens per second)
+llm_perf_print: total time = xxxx.xx ms / 89 tokens
 ```
 
 ##### Conversation

python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/llm-npu-cli.cpp renamed to python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/llama-cli-npu.cpp

Lines changed: 6 additions & 21 deletions
@@ -99,22 +99,15 @@ std::string add_chat_history(npu_model_params model_params,
 }
 
 std::string run_generate(void* void_model, int32_t* embd_inp_ptr, int32_t embd_inp_size,
-                         npu_model_params model_params, tokenizer_params tok_params, npu_generation_params generation_params, bool do_print){
-    auto start = std::chrono::high_resolution_clock::now();
+                         npu_model_params model_params, tokenizer_params tok_params, npu_generation_params generation_params){
     float* logits = run_prefill(void_model, embd_inp_ptr, embd_inp_size,
                                 generation_params.repetition_penalty);
     int32_t token = llm_sample_token(logits, true, model_params.vocab_size);
-    auto end = std::chrono::high_resolution_clock::now();
-    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-    if (do_print){
-        printf("\nPrefill %d tokens cost %d ms.\n", embd_inp_size, duration.count());
-    }
 
     std::vector<int32_t> embd; // output ids
     embd.push_back(token);
 
     int token_nums = 0;
-    start = std::chrono::high_resolution_clock::now();
     for (int i = 1; i < generation_params.max_new_token; i++){
         auto logits = run_decode(void_model, embd[i-1],
                                  generation_params.repetition_penalty);
@@ -126,15 +119,9 @@ std::string run_generate(void* void_model, int32_t* embd_inp_ptr, int32_t embd_i
             break;
         }
     }
-    end = std::chrono::high_resolution_clock::now();
-    duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
 
     std::string output = llm_decode(embd);
 
-    if (do_print){
-        printf("\nDecode %d tokens cost %d ms (avg %f ms each token).\n", token_nums, duration.count(), (float)duration.count() / token_nums);
-    }
-
     return output;
 }
 
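Aside: with the `do_print` flag and the in-function `std::chrono` bookkeeping removed, `run_generate` no longer reports prefill/decode latency itself; this commit moves timing output to `llm_perf_print` at the call site (see the README sample output above and the `main()` changes below). If ad-hoc wall-clock timing of a single call is still wanted, the same `std::chrono` pattern can be applied around the call. A minimal sketch, not part of the commit; the `npu_*` types and `run_generate` come from this example's headers, and `timed_generate` is a hypothetical helper name:

```cpp
// Illustrative only: time a whole run_generate() call at the call site with
// std::chrono, since the in-function printf timing was removed in this commit.
// npu_model_params, tokenizer_params, npu_generation_params and run_generate()
// are declared by this example's headers; timed_generate() is a made-up helper.
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <string>

std::string timed_generate(void* model, int32_t* ids, int32_t n_ids,
                           npu_model_params model_params, tokenizer_params tok_params,
                           npu_generation_params generation_params) {
    auto start = std::chrono::high_resolution_clock::now();
    std::string out = run_generate(model, ids, n_ids,
                                   model_params, tok_params, generation_params);
    auto end = std::chrono::high_resolution_clock::now();
    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    std::printf("run_generate: %lld ms total\n", static_cast<long long>(ms.count()));
    return out;
}
```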

@@ -237,7 +224,7 @@ int main(int argc, char ** argv) {
             generation_params.max_new_token = model_params.kv_len - embd_inp.size();
 
             response = run_generate(model, embd_inp.data(), embd_inp.size(),
-                                    model_params, tok_params, generation_params, false);
+                                    model_params, tok_params, generation_params);
 
             std::cout << "Assistant:";
             std::cout << response << std::endl;
@@ -250,18 +237,16 @@ int main(int argc, char ** argv) {
     }
     else{
         std::string full_prompt = add_chat_template(model_params, params.prompt);
-        std::cout << "Input: " << std::endl;
-        std::cout << full_prompt << std::endl;
 
         // tokenize input
         std::vector<int32_t> embd_inp = llm_tokenize(full_prompt, false);
 
         // single text generation
        std::string output = run_generate(model, embd_inp.data(), embd_inp.size(),
-                                          model_params, tok_params, generation_params, true);
+                                          model_params, tok_params, generation_params);
 
-        std::cout << "Output: " << std::endl;
-        std::cout << output << std::endl;
+        std::cout << output << std::endl << std::endl;
+        llm_perf_print(model);
     }
     return 0;
-}
+}
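For reference, the simplified single-shot path in `main()` now just prints the generated text and calls `llm_perf_print(model)` to emit the load/prompt eval/eval/total summary shown in the updated README. A rough sketch of that flow, assembled from identifiers visible in this diff (`add_chat_template`, `llm_tokenize`, `run_generate`, `llm_perf_print`); the includes, the wrapper function, and the surrounding parameter setup are assumptions, not part of the commit:

```cpp
// Sketch of the post-commit single-shot path (assumed wrapper and includes;
// the functions and types named here appear in the diff above).
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int run_once(void* model, npu_model_params model_params, tokenizer_params tok_params,
             npu_generation_params generation_params, const std::string& prompt) {
    // Build the chat-formatted prompt and tokenize it.
    std::string full_prompt = add_chat_template(model_params, prompt);
    std::vector<int32_t> embd_inp = llm_tokenize(full_prompt, false);

    // Generate once; per-stage timing is now collected inside the library.
    std::string output = run_generate(model, embd_inp.data(), embd_inp.size(),
                                      model_params, tok_params, generation_params);

    std::cout << output << std::endl << std::endl;
    llm_perf_print(model);   // prints the load / prompt eval / eval / total summary
    return 0;
}
```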
