Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions cmake/LlamaCpp.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ set(BUILD_SHARED_LIBS "OFF" CACHE STRING "" FORCE)
set(LLAMA_BUILD_TESTS "OFF" CACHE STRING "" FORCE)
set(LLAMA_BUILD_EXAMPLES "OFF" CACHE STRING "" FORCE)
set(LLAMA_BUILD_SERVER "OFF" CACHE STRING "" FORCE)
set(LLAMA_BUILD_COMMON "ON" CACHE STRING "" FORCE)
set(GGML_OPENMP "OFF" CACHE STRING "" FORCE)
set(GGML_METAL "OFF" CACHE STRING "" FORCE)
set(GGML_BLAS "OFF" CACHE STRING "" FORCE)
Expand All @@ -30,16 +31,15 @@ else()
set(GGML_NATIVE "ON" CACHE STRING "" FORCE)
endif()

set(PATCH_FILE_1 "${CMAKE_SOURCE_DIR}/thirdparty/llamacpp/lu8_macro_fix.patch") # https://github.com/ggml-org/llama.cpp/issues/12740
set(PATCH_FILE_2 "${CMAKE_SOURCE_DIR}/thirdparty/llamacpp/cpp-23-fixes.patch")
set(PATCH_FILE_1 "${CMAKE_SOURCE_DIR}/thirdparty/llamacpp/mtmd-fix.patch")

set(PC ${Bash_EXECUTABLE} -c "set -x &&\
(\\\"${Patch_EXECUTABLE}\\\" -p1 -R -s -f --dry-run -i \\\"${PATCH_FILE_1}\\\" || \\\"${Patch_EXECUTABLE}\\\" -p1 -N -i \\\"${PATCH_FILE_1}\\\") &&\
(\\\"${Patch_EXECUTABLE}\\\" -p1 -R -s -f --dry-run -i \\\"${PATCH_FILE_2}\\\" || \\\"${Patch_EXECUTABLE}\\\" -p1 -N -i \\\"${PATCH_FILE_2}\\\")")
(\\\"${Patch_EXECUTABLE}\\\" -p1 -R -s -f --dry-run -i \\\"${PATCH_FILE_1}\\\" || \\\"${Patch_EXECUTABLE}\\\" -p1 -N -i \\\"${PATCH_FILE_1}\\\")")


FetchContent_Declare(llamacpp
URL https://github.com/ggml-org/llama.cpp/archive/refs/tags/b5958.tar.gz
URL_HASH SHA256=4e8a2abd83092aa446cd13556f6fe8777139da7b191bdaa0e1b79fe9740b36a6
URL https://github.com/ggml-org/llama.cpp/archive/refs/tags/b7836.tar.gz
URL_HASH SHA256=3d384e7e8b3bc3cd31abddedf684a6e201405c1d932cafb3c4a5277d872b0614
PATCH_COMMAND "${PC}"
SYSTEM
)
Expand All @@ -49,5 +49,6 @@ FetchContent_MakeAvailable(llamacpp)
set(LLAMACPP_INCLUDE_DIRS
"${llamacpp_SOURCE_DIR}/include"
"${llamacpp_SOURCE_DIR}/ggml/include"
"${llamacpp_SOURCE_DIR}/tools"
CACHE STRING "" FORCE
)
2 changes: 1 addition & 1 deletion extensions/llamacpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ add_minifi_library(minifi-llamacpp SHARED ${SOURCES})
target_include_directories(minifi-llamacpp PUBLIC "${CMAKE_SOURCE_DIR}/extensions/llamacpp")
target_include_directories(minifi-llamacpp PUBLIC "${LLAMACPP_INCLUDE_DIRS}")

target_link_libraries(minifi-llamacpp minifi-cpp-extension-lib llama)
target_link_libraries(minifi-llamacpp minifi-cpp-extension-lib llama mtmd)

register_c_api_extension(minifi-llamacpp "LLAMACPP EXTENSION" LLAMACPP-EXTENSION "Provides llama.cpp support" "extensions/llamacpp/tests")

148 changes: 133 additions & 15 deletions extensions/llamacpp/processors/DefaultLlamaContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,12 @@
*/

#include "DefaultLlamaContext.h"

#include <ranges>

#include "minifi-cpp/Exception.h"
#include "fmt/format.h"
#include "mtmd/mtmd-helper.h"

namespace org::apache::nifi::minifi::extensions::llamacpp::processors {

Expand All @@ -41,7 +45,8 @@ constexpr size_t DEFAULT_BUFFER_SIZE = 4096;
} // namespace


DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path, const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params) {
DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path, const std::optional<std::filesystem::path>& multimodal_model_path,
const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params, const std::shared_ptr<core::logging::Logger>& logger) {
llama_model_ = llama_model_load_from_file(model_path.string().c_str(), llama_model_default_params()); // NOLINT(cppcoreguidelines-prefer-member-initializer)
if (!llama_model_) {
throw Exception(ExceptionType::PROCESS_SCHEDULE_EXCEPTION, fmt::format("Failed to load model from '{}'", model_path.string()));
Expand All @@ -54,7 +59,7 @@ DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path
ctx_params.n_seq_max = llama_ctx_params.n_seq_max;
ctx_params.n_threads = llama_ctx_params.n_threads;
ctx_params.n_threads_batch = llama_ctx_params.n_threads_batch;
ctx_params.flash_attn = false;
ctx_params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
llama_ctx_ = llama_init_from_model(llama_model_, ctx_params);

auto sparams = llama_sampler_chain_default_params();
Expand All @@ -73,9 +78,27 @@ DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path
llama_sampler_chain_add(llama_sampler_, llama_sampler_init_temp(*llama_sampler_params.temperature));
}
llama_sampler_chain_add(llama_sampler_, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

if (!multimodal_model_path) {
logger->log_info("No multimodal model path provided");
return;
}

mtmd_context_params mparams = mtmd_context_params_default();
mparams.use_gpu = false;
mparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;

multimodal_ctx_ = mtmd_init_from_file(multimodal_model_path->string().c_str(), llama_model_, mparams);
if (!multimodal_ctx_) {
throw Exception(ExceptionType::PROCESS_SCHEDULE_EXCEPTION, fmt::format("Failed to load multimodal model from '{}'", multimodal_model_path->string()));
}

logger->log_info("Successfully loaded multimodal model from '{}'", multimodal_model_path->string());
}

DefaultLlamaContext::~DefaultLlamaContext() {
mtmd_free(multimodal_ctx_);
multimodal_ctx_ = nullptr;
llama_sampler_free(llama_sampler_);
llama_sampler_ = nullptr;
llama_free(llama_ctx_);
Expand Down Expand Up @@ -108,24 +131,100 @@ std::optional<std::string> DefaultLlamaContext::applyTemplate(const std::vector<
return text;
}

nonstd::expected<GenerationResult, std::string> DefaultLlamaContext::generate(const std::string& input, std::function<void(std::string_view/*token*/)> token_handler) {
namespace {

// Deleter functor for std::unique_ptr: hands the bitmap pointer to mtmd_bitmap_free.
struct mtmd_bitmap_deleter {
  void operator()(mtmd_bitmap* bitmap) {
    mtmd_bitmap_free(bitmap);
  }
};
// Owning smart pointer for an mtmd_bitmap, released through mtmd_bitmap_deleter.
using unique_bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;

// Deleter functor for std::unique_ptr: hands the chunk list pointer to mtmd_input_chunks_free.
struct mtmd_input_chunks_deleter {
  void operator()(mtmd_input_chunks* chunks) {
    mtmd_input_chunks_free(chunks);
  }
};
// Owning smart pointer for an mtmd_input_chunks, released through mtmd_input_chunks_deleter.
using unique_mtmd_input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;

class unique_llama_batch {
public:
explicit unique_llama_batch(std::optional<llama_batch> batch = std::nullopt): batch_(std::move(batch)) {}

unique_llama_batch(unique_llama_batch&&) = default;
unique_llama_batch& operator=(unique_llama_batch&&) = default;
unique_llama_batch(const unique_llama_batch&) = delete;
unique_llama_batch& operator=(const unique_llama_batch&) = delete;

std::optional<llama_batch> get() const {
return batch_;
}

std::optional<llama_batch>& operator->() {
return batch_;
}

void reset(std::optional<llama_batch> batch = std::nullopt) {
if (batch_) {
llama_batch_free(batch_.value());
}
batch_ = std::move(batch);
}

~unique_llama_batch() {
if (batch_) {
llama_batch_free(batch_.value());
}
batch_.reset();
}

private:
std::optional<llama_batch> batch_;
};

} // namespace

nonstd::expected<GenerationResult, std::string> DefaultLlamaContext::generate(const std::string& prompt, const std::vector<std::vector<std::byte>>& files, std::function<void(std::string_view/*token*/)> token_handler) {
GenerationResult result{};
auto start_time = std::chrono::steady_clock::now();
llama_memory_seq_rm(llama_get_memory(llama_ctx_), 0, -1, -1);
const llama_vocab * vocab = llama_model_get_vocab(llama_model_);
std::vector<llama_token> tokenized_input = tokenizeInput(vocab, input);
result.num_tokens_in = gsl::narrow<uint64_t>(tokenized_input.size());
llama_pos n_past = 0;
std::vector<llama_token> tokenized_input;
unique_llama_batch batch;
int32_t decode_status = 0;
if (multimodal_ctx_) {
gsl_Assert(!files.empty());
std::vector<unique_bitmap_ptr> bitmaps;
for (auto& file : files) {
unique_bitmap_ptr bitmap{mtmd_helper_bitmap_init_from_buf(multimodal_ctx_, reinterpret_cast<const unsigned char*>(file.data()), file.size())};
if (!bitmap) {
throw Exception(PROCESSOR_EXCEPTION, "Failed to create multimodal bitmap from buffer");
}
bitmaps.push_back(std::move(bitmap));
}
mtmd_input_text inp_txt = {
.text = prompt.c_str(),
.add_special = true,
.parse_special = true,
};
unique_mtmd_input_chunks_ptr chunks{mtmd_input_chunks_init()};
auto bitmap_c_ptrs = bitmaps | std::views::transform([] (auto& ptr) {return ptr.get();}) | std::ranges::to<std::vector<const mtmd_bitmap*>>();
auto tokenized = mtmd_tokenize(multimodal_ctx_, chunks.get(), &inp_txt, bitmap_c_ptrs.data(), bitmap_c_ptrs.size());
if (tokenized != 0) {
throw Exception(PROCESSOR_EXCEPTION, fmt::format("Failed to tokenize multimodal prompt, error: {}", tokenized));
}
auto status = mtmd_helper_eval_chunks(multimodal_ctx_, llama_ctx_, chunks.get(), 0, 0, 1, true, &n_past);
if (status != 0) {
throw Exception(PROCESSOR_EXCEPTION, fmt::format("Failed to eval multimodal chunks, error: {}", status));
}
} else {
gsl_Assert(files.empty());
tokenized_input = tokenizeInput(vocab, prompt);
n_past = gsl::narrow<llama_pos>(tokenized_input.size());
result.num_tokens_in = gsl::narrow<uint64_t>(tokenized_input.size());
decode_status = llama_decode(llama_ctx_, llama_batch_get_one(tokenized_input.data(), n_past));
}

llama_batch batch = llama_batch_get_one(tokenized_input.data(), gsl::narrow<int32_t>(tokenized_input.size()));
llama_token new_token_id = 0;
bool first_token_generated = false;
while (true) {
int32_t res = llama_decode(llama_ctx_, batch);
if (res == 1) {
return nonstd::make_unexpected("Could not find a KV slot for the batch (try reducing the size of the batch or increase the context)");
} else if (res < 0) {
return nonstd::make_unexpected("Error occurred while executing llama decode");
}

while (decode_status == 0) {
new_token_id = llama_sampler_sample(llama_sampler_, llama_ctx_, -1);
if (!first_token_generated) {
result.time_to_first_token = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start_time);
Expand All @@ -147,8 +246,27 @@ nonstd::expected<GenerationResult, std::string> DefaultLlamaContext::generate(co
gsl_Assert(len < 128);

std::string_view token_str{buf.data(), gsl::narrow<std::string_view::size_type>(len)};
batch = llama_batch_get_one(&new_token_id, 1);
batch.reset(llama_batch_init(1, 0, 1));
batch->n_tokens = 1;
batch->token[0] = new_token_id;
batch->pos[0] = n_past;
batch->n_seq_id[0] = 1;
batch->seq_id[0][0] = 0;
batch->logits[0] = true;
++n_past;
token_handler(token_str);

decode_status = llama_decode(llama_ctx_, batch.get().value());
}

if (decode_status == 1) {
return nonstd::make_unexpected("Could not find a KV slot for the batch (try reducing the size of the batch or increase the context)");
}
if (decode_status == 2) {
return nonstd::make_unexpected("Llama decode aborted");
}
if (decode_status < 0) {
return nonstd::make_unexpected("Error occurred while executing llama decode");
}

result.tokens_per_second =
Expand Down
8 changes: 6 additions & 2 deletions extensions/llamacpp/processors/DefaultLlamaContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,29 @@
#include "LlamaContext.h"
#include "llama.h"
#include "LlamaBackendInitializer.h"
#include "mtmd/mtmd.h"
#include "minifi-cpp/core/logging/Logger.h"

namespace org::apache::nifi::minifi::extensions::llamacpp::processors {

class DefaultLlamaContext : public LlamaContext {
public:
DefaultLlamaContext(const std::filesystem::path& model_path, const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params);
DefaultLlamaContext(const std::filesystem::path& model_path, const std::optional<std::filesystem::path>& multimodal_model_path,
const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params, const std::shared_ptr<core::logging::Logger>& logger);
DefaultLlamaContext(const DefaultLlamaContext&) = delete;
DefaultLlamaContext(DefaultLlamaContext&&) = delete;
DefaultLlamaContext& operator=(const DefaultLlamaContext&) = delete;
DefaultLlamaContext& operator=(DefaultLlamaContext&&) = delete;
~DefaultLlamaContext() override;

std::optional<std::string> applyTemplate(const std::vector<LlamaChatMessage>& messages) override;
nonstd::expected<GenerationResult, std::string> generate(const std::string& input, std::function<void(std::string_view/*token*/)> token_handler) override;
nonstd::expected<GenerationResult, std::string> generate(const std::string& prompt, const std::vector<std::vector<std::byte>>& files, std::function<void(std::string_view/*token*/)> token_handler) override;

private:
const LlamaBackendInitializer& llama_context_initializer_ = LlamaBackendInitializer::get();
llama_model* llama_model_{};
llama_context* llama_ctx_{};
mtmd_context* multimodal_ctx_{};
llama_sampler* llama_sampler_{};
};

Expand Down
2 changes: 1 addition & 1 deletion extensions/llamacpp/processors/LlamaContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ struct GenerationResult {
class LlamaContext {
public:
virtual std::optional<std::string> applyTemplate(const std::vector<LlamaChatMessage>& messages) = 0;
virtual nonstd::expected<GenerationResult, std::string> generate(const std::string& input, std::function<void(std::string_view/*token*/)> token_handler) = 0;
virtual nonstd::expected<GenerationResult, std::string> generate(const std::string& prompt, const std::vector<std::vector<std::byte>>& files, std::function<void(std::string_view/*token*/)> token_handler) = 0;
virtual ~LlamaContext() = default;
};

Expand Down
21 changes: 17 additions & 4 deletions extensions/llamacpp/processors/RunLlamaCppInference.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ namespace org::apache::nifi::minifi::extensions::llamacpp::processors {
MinifiStatus RunLlamaCppInference::onScheduleImpl(api::core::ProcessContext& context) {
model_path_.clear();
model_path_ = api::utils::parseProperty(context, ModelPath);
multimodal_model_path_ = api::utils::parseOptionalProperty(context, MultiModalModelPath);
system_prompt_ = context.getProperty(SystemPrompt).value_or("");
output_attribute_ = api::utils::parseOptionalProperty(context, OutputAttributeName);

LlamaSamplerParams llama_sampler_params;
llama_sampler_params.temperature = api::utils::parseOptionalFloatProperty(context, Temperature);
Expand All @@ -53,7 +55,7 @@ MinifiStatus RunLlamaCppInference::onScheduleImpl(api::core::ProcessContext& con
if (llama_context_provider_) {
llama_ctx_ = llama_context_provider_(model_path_, llama_sampler_params, llama_ctx_params);
} else {
llama_ctx_ = std::make_unique<DefaultLlamaContext>(model_path_, llama_sampler_params, llama_ctx_params);
llama_ctx_ = std::make_unique<DefaultLlamaContext>(model_path_, multimodal_model_path_, llama_sampler_params, llama_ctx_params, logger_);
}

return MINIFI_STATUS_SUCCESS;
Expand All @@ -76,10 +78,16 @@ MinifiStatus RunLlamaCppInference::onTriggerImpl(api::core::ProcessContext& cont
auto prompt = context.getProperty(Prompt, &flow_file).value_or("");

auto read_result = session.readBuffer(flow_file);
std::vector<std::vector<std::byte>> files;
std::string input_data_and_prompt;
if (!read_result.empty()) {
input_data_and_prompt.append("Input data (or flow file content):\n");
input_data_and_prompt.append({reinterpret_cast<const char*>(read_result.data()), read_result.size()});
if (multimodal_model_path_) {
input_data_and_prompt.append(mtmd_default_marker());
files.push_back(std::move(read_result));
} else {
input_data_and_prompt.append({reinterpret_cast<const char*>(read_result.data()), read_result.size()});
}
input_data_and_prompt.append("\n\n");
}
input_data_and_prompt.append(prompt);
Expand Down Expand Up @@ -111,7 +119,7 @@ MinifiStatus RunLlamaCppInference::onTriggerImpl(api::core::ProcessContext& cont
auto start_time = std::chrono::steady_clock::now();

std::string text;
auto generation_result = llama_ctx_->generate(*input, [&] (std::string_view token) {
auto generation_result = llama_ctx_->generate(*input, files, [&] (std::string_view token) {
text += token;
});

Expand All @@ -133,7 +141,12 @@ MinifiStatus RunLlamaCppInference::onTriggerImpl(api::core::ProcessContext& cont
session.setAttribute(flow_file, LlamaCppTimeToFirstToken.name, std::to_string(generation_result->time_to_first_token.count()) + " ms");
session.setAttribute(flow_file, LlamaCppTokensPerSecond.name, fmt::format("{:.2f}", generation_result->tokens_per_second));

session.writeBuffer(flow_file, text);
if (output_attribute_) {
session.setAttribute(flow_file, output_attribute_.value(), text);
} else {
session.writeBuffer(flow_file, text);
}

session.transfer(std::move(flow_file), Success);

return MINIFI_STATUS_SUCCESS;
Expand Down
10 changes: 10 additions & 0 deletions extensions/llamacpp/processors/RunLlamaCppInference.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ class RunLlamaCppInference : public api::core::ProcessorImpl {
.withDescription("The filesystem path of the model file in gguf format.")
.isRequired(true)
.build();
EXTENSIONAPI static constexpr auto OutputAttributeName = core::PropertyDefinitionBuilder<>::createProperty("Output Attribute Name")
.withDescription("Specify the attribute to use as output, if not provided, the content is overridden instead.")
.build();
EXTENSIONAPI static constexpr auto MultiModalModelPath = core::PropertyDefinitionBuilder<>::createProperty("MultiModal Model Path")
.withDescription("The filesystem path of the multimodal model (visual, audio) file in gguf format.")
.build();
EXTENSIONAPI static constexpr auto Temperature = core::PropertyDefinitionBuilder<>::createProperty("Temperature")
.withDescription("The temperature to use for sampling.")
.withDefaultValue("0.8")
Expand Down Expand Up @@ -128,6 +134,8 @@ class RunLlamaCppInference : public api::core::ProcessorImpl {

EXTENSIONAPI static constexpr auto Properties = std::to_array<core::PropertyReference>({
ModelPath,
OutputAttributeName,
MultiModalModelPath,
Temperature,
TopK,
TopP,
Expand Down Expand Up @@ -167,7 +175,9 @@ class RunLlamaCppInference : public api::core::ProcessorImpl {
void increaseTokensOut(uint64_t token_count);

std::string model_path_;
std::optional<std::string> multimodal_model_path_;
std::string system_prompt_;
std::optional<std::string> output_attribute_;

LlamaContextProvider llama_context_provider_;
std::unique_ptr<LlamaContext> llama_ctx_;
Expand Down
2 changes: 1 addition & 1 deletion extensions/llamacpp/tests/RunLlamaCppInferenceTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class MockLlamaContext : public processors::LlamaContext {
return "Test input";
}

nonstd::expected<processors::GenerationResult, std::string> generate(const std::string& input, std::function<void(std::string_view/*token*/)> token_handler) override {
nonstd::expected<processors::GenerationResult, std::string> generate(const std::string& input, const std::vector<std::vector<std::byte>>& /*files*/, std::function<void(std::string_view/*token*/)> token_handler) override {
if (fail_generation_) {
return nonstd::make_unexpected("Generation failed");
}
Expand Down
Loading
Loading