
Commit 78cca0a

[NPU] update llm-npu-cli example (#12729)
* update cli example
* add license
* rename
* update readme sample output
1 parent 7e29edc commit 78cca0a


3 files changed: +26 -42 lines changed


python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/CMakeLists.txt

Lines changed: 4 additions & 4 deletions
@@ -18,20 +18,20 @@ endif()
 add_library(npu_llm STATIC IMPORTED)
 set_target_properties(npu_llm PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/npu_llm.lib)
 
-set(TARGET llm-npu-cli)
-add_executable(${TARGET} llm-npu-cli.cpp)
+set(TARGET llama-cli-npu)
+add_executable(${TARGET} llama-cli-npu.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE npu_llm)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
 
-add_custom_command(TARGET llm-npu-cli POST_BUILD
+add_custom_command(TARGET llama-cli-npu POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E copy_if_different
         ${LIBRARY_DIR}/npu_llm.dll
         ${CMAKE_BINARY_DIR}/Release/
     COMMENT "Copying npu_llm.dll to build/Release\n"
 )
 
-add_custom_command(TARGET llm-npu-cli POST_BUILD
+add_custom_command(TARGET llama-cli-npu POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E copy_directory
         ${DLL_DIR}/
         ${CMAKE_BINARY_DIR}/Release/

python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/README.md

Lines changed: 16 additions & 17 deletions
@@ -81,9 +81,9 @@ Arguments info:
 - `--max-prompt-len MAX_PROMPT_LEN`: argument defining the maximum number of tokens that the input prompt can contain. It is default to be `512`.
 - `--low-bit LOW_BIT`: argument defining the low bit optimizations that will be applied to the model. Current available options are `"sym_int4"`, `"asym_int4"` and `"sym_int8"`, with `"sym_int4"` as the default.
 
-## 3. Build C++ Example `llm-npu-cli`
+## 3. Build C++ Example `llama-cli-npu`(Optional)
 
-You can run below cmake script in cmd to build `llm-npu-cli`, don't forget to replace below conda env dir with your own path.
+- You can run below cmake script in cmd to build `llama-cli-npu` by yourself, don't forget to replace below <CONDA_ENV_DIR> with your own path.
 
 ```cmd
 :: under current directory
@@ -96,16 +96,21 @@ cmake --build . --config Release -j
 cd Release
 ```
 
-## 4. Run `llm-npu-cli`
+- You can also directly use our released `llama-cli-npu.exe` which has the same usage as this example `llama-cli-npu.cpp`
 
-With built `llm-npu-cli`, you can run the example with specified paramaters. For example,
+> [!NOTE]
+> Our released `llama-cli-npu.exe` can be found at <CONDA_ENV_DIR>\bigdl-core-npu
+
+## 4. Run `llama-cli-npu`
+
+With built `llama-cli-npu`, you can run the example with specified paramaters. For example,
 
 ```cmd
 # Run simple text completion
-llm-npu-cli.exe -m <converted_model_path> -n 64 "AI是什么?"
+llama-cli-npu.exe -m <converted_model_path> -n 64 "AI是什么?"
 
 # Run in conversation mode
-llm-npu-cli.exe -m <converted_model_path> -cnv
+llama-cli-npu.exe -m <converted_model_path> -cnv
 ```
 
 Arguments info:
@@ -118,18 +123,12 @@ Arguments info:
 #### [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)
 ##### Text Completion
 ```cmd
-Input:
-<s>[INST] <<SYS>>
-
-<</SYS>>
-
-What is AI? [/INST]
-
-Prefill 26 tokens cost xxxx ms.
+AI stands for Artificial Intelligence, which is the field of study focused on creating and developing intelligent machines that can perform tasks that typically require human intelligence, such as visual and auditory recognition, speech recognition, and decision-making. AI is a broad and diverse field that includes a wide range
 
-Decode 63 tokens cost xxxx ms (avg xxxx ms each token).
-Output:
-AI stands for Artificial Intelligence, which is the field of study focused on creating and developing intelligent machines that can perform tasks that typically require human intelligence, such as visual and auditory recognition, speech recognition, and decision-making. AI is a broad and diverse field that includes a wide range
+llm_perf_print: load time = xxxx.xx ms
+llm_perf_print: prompt eval time = xxxx.xx ms / 26 tokens ( xx.xx ms per token, xx.xx tokens per second)
+llm_perf_print: eval time = xxxx.xx ms / 63 runs ( xx.xx ms per token, xx.xx tokens per second)
+llm_perf_print: total time = xxxx.xx ms / 89 tokens
 ```
 
 ##### Conversation

python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/llm-npu-cli.cpp renamed to python/llm/example/NPU/HF-Transformers-AutoModels/LLM/CPP_Examples/llama-cli-npu.cpp

Lines changed: 6 additions & 21 deletions
@@ -99,22 +99,15 @@ std::string add_chat_history(npu_model_params model_params,
 }
 
 std::string run_generate(void* void_model, int32_t* embd_inp_ptr, int32_t embd_inp_size,
-                         npu_model_params model_params, tokenizer_params tok_params, npu_generation_params generation_params, bool do_print){
-    auto start = std::chrono::high_resolution_clock::now();
+                         npu_model_params model_params, tokenizer_params tok_params, npu_generation_params generation_params){
     float* logits = run_prefill(void_model, embd_inp_ptr, embd_inp_size,
                                 generation_params.repetition_penalty);
     int32_t token = llm_sample_token(logits, true, model_params.vocab_size);
-    auto end = std::chrono::high_resolution_clock::now();
-    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
-    if (do_print){
-        printf("\nPrefill %d tokens cost %d ms.\n", embd_inp_size, duration.count());
-    }
 
     std::vector<int32_t> embd; // output ids
     embd.push_back(token);
 
     int token_nums = 0;
-    start = std::chrono::high_resolution_clock::now();
     for (int i = 1; i < generation_params.max_new_token; i++){
         auto logits = run_decode(void_model, embd[i-1],
                                  generation_params.repetition_penalty);
@@ -126,15 +119,9 @@ std::string run_generate(void* void_model, int32_t* embd_inp_ptr, int32_t embd_i
             break;
         }
     }
-    end = std::chrono::high_resolution_clock::now();
-    duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
 
     std::string output = llm_decode(embd);
 
-    if (do_print){
-        printf("\nDecode %d tokens cost %d ms (avg %f ms each token).\n", token_nums, duration.count(), (float)duration.count() / token_nums);
-    }
-
     return output;
 }
 
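Aside: with the `do_print` flag and the in-function `std::chrono` bookkeeping removed, `run_generate` no longer reports prefill/decode latency itself; this commit moves timing output to `llm_perf_print` at the call site (see the README sample output above and the `main()` changes below). If ad-hoc wall-clock timing of a single call is still wanted, the same `std::chrono` pattern can be applied around the call. A minimal sketch, not part of the commit; the `npu_*` types and `run_generate` come from this example's headers, and `timed_generate` is a hypothetical helper name:

```cpp
// Illustrative only: time a whole run_generate() call at the call site with
// std::chrono, since the in-function printf timing was removed in this commit.
// npu_model_params, tokenizer_params, npu_generation_params and run_generate()
// are declared by this example's headers; timed_generate() is a made-up helper.
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <string>

std::string timed_generate(void* model, int32_t* ids, int32_t n_ids,
                           npu_model_params model_params, tokenizer_params tok_params,
                           npu_generation_params generation_params) {
    auto start = std::chrono::high_resolution_clock::now();
    std::string out = run_generate(model, ids, n_ids,
                                   model_params, tok_params, generation_params);
    auto end = std::chrono::high_resolution_clock::now();
    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    std::printf("run_generate: %lld ms total\n", static_cast<long long>(ms.count()));
    return out;
}
```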

@@ -237,7 +224,7 @@ int main(int argc, char ** argv) {
             generation_params.max_new_token = model_params.kv_len - embd_inp.size();
 
             response = run_generate(model, embd_inp.data(), embd_inp.size(),
-                                    model_params, tok_params, generation_params, false);
+                                    model_params, tok_params, generation_params);
 
             std::cout << "Assistant:";
             std::cout << response << std::endl;
@@ -250,18 +237,16 @@ int main(int argc, char ** argv) {
     }
     else{
         std::string full_prompt = add_chat_template(model_params, params.prompt);
-        std::cout << "Input: " << std::endl;
-        std::cout << full_prompt << std::endl;
 
         // tokenize input
         std::vector<int32_t> embd_inp = llm_tokenize(full_prompt, false);
 
         // single text generation
        std::string output = run_generate(model, embd_inp.data(), embd_inp.size(),
-                                          model_params, tok_params, generation_params, true);
+                                          model_params, tok_params, generation_params);
 
-        std::cout << "Output: " << std::endl;
-        std::cout << output << std::endl;
+        std::cout << output << std::endl << std::endl;
+        llm_perf_print(model);
     }
     return 0;
-}
+}
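For reference, the simplified single-shot path in `main()` now just prints the generated text and calls `llm_perf_print(model)` to emit the load/prompt eval/eval/total summary shown in the updated README. A rough sketch of that flow, assembled from identifiers visible in this diff (`add_chat_template`, `llm_tokenize`, `run_generate`, `llm_perf_print`); the includes, the wrapper function, and the surrounding parameter setup are assumptions, not part of the commit:

```cpp
// Sketch of the post-commit single-shot path (assumed wrapper and includes;
// the functions and types named here appear in the diff above).
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int run_once(void* model, npu_model_params model_params, tokenizer_params tok_params,
             npu_generation_params generation_params, const std::string& prompt) {
    // Build the chat-formatted prompt and tokenize it.
    std::string full_prompt = add_chat_template(model_params, prompt);
    std::vector<int32_t> embd_inp = llm_tokenize(full_prompt, false);

    // Generate once; per-stage timing is now collected inside the library.
    std::string output = run_generate(model, embd_inp.data(), embd_inp.size(),
                                      model_params, tok_params, generation_params);

    std::cout << output << std::endl << std::endl;
    llm_perf_print(model);   // prints the load / prompt eval / eval / total summary
    return 0;
}
```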
