apache · adamdebreceni · Feb 17, 2026 · Feb 18, 2026
diff --git a/cmake/LlamaCpp.cmake b/cmake/LlamaCpp.cmake
@@ -21,6 +21,7 @@ set(BUILD_SHARED_LIBS "OFF" CACHE STRING "" FORCE)
 set(LLAMA_BUILD_TESTS "OFF" CACHE STRING "" FORCE)
 set(LLAMA_BUILD_EXAMPLES "OFF" CACHE STRING "" FORCE)
 set(LLAMA_BUILD_SERVER "OFF" CACHE STRING "" FORCE)
+set(LLAMA_BUILD_COMMON "ON" CACHE STRING "" FORCE)
 set(GGML_OPENMP "OFF" CACHE STRING "" FORCE)
 set(GGML_METAL "OFF" CACHE STRING "" FORCE)
 set(GGML_BLAS "OFF" CACHE STRING "" FORCE)
@@ -30,16 +31,15 @@ else()
     set(GGML_NATIVE "ON" CACHE STRING "" FORCE)
 endif()
 
-set(PATCH_FILE_1 "${CMAKE_SOURCE_DIR}/thirdparty/llamacpp/lu8_macro_fix.patch")  # https://github.com/ggml-org/llama.cpp/issues/12740
-set(PATCH_FILE_2 "${CMAKE_SOURCE_DIR}/thirdparty/llamacpp/cpp-23-fixes.patch")
+set(PATCH_FILE_1 "${CMAKE_SOURCE_DIR}/thirdparty/llamacpp/mtmd-fix.patch")
 
 set(PC ${Bash_EXECUTABLE}  -c "set -x &&\
-            (\\\"${Patch_EXECUTABLE}\\\" -p1 -R -s -f --dry-run -i \\\"${PATCH_FILE_1}\\\" || \\\"${Patch_EXECUTABLE}\\\" -p1 -N -i \\\"${PATCH_FILE_1}\\\") &&\
-            (\\\"${Patch_EXECUTABLE}\\\" -p1 -R -s -f --dry-run -i \\\"${PATCH_FILE_2}\\\" || \\\"${Patch_EXECUTABLE}\\\" -p1 -N -i \\\"${PATCH_FILE_2}\\\")")
+            (\\\"${Patch_EXECUTABLE}\\\" -p1 -R -s -f --dry-run -i \\\"${PATCH_FILE_1}\\\" || \\\"${Patch_EXECUTABLE}\\\" -p1 -N -i \\\"${PATCH_FILE_1}\\\")")
+
 
 FetchContent_Declare(llamacpp
-        URL https://github.com/ggml-org/llama.cpp/archive/refs/tags/b5958.tar.gz
-        URL_HASH SHA256=4e8a2abd83092aa446cd13556f6fe8777139da7b191bdaa0e1b79fe9740b36a6
+        URL https://github.com/ggml-org/llama.cpp/archive/refs/tags/b7836.tar.gz
+        URL_HASH SHA256=3d384e7e8b3bc3cd31abddedf684a6e201405c1d932cafb3c4a5277d872b0614
         PATCH_COMMAND "${PC}"
         SYSTEM
 )
@@ -49,5 +49,6 @@ FetchContent_MakeAvailable(llamacpp)
 set(LLAMACPP_INCLUDE_DIRS
     "${llamacpp_SOURCE_DIR}/include"
     "${llamacpp_SOURCE_DIR}/ggml/include"
+    "${llamacpp_SOURCE_DIR}/tools"
     CACHE STRING "" FORCE
 )
diff --git a/extensions/llamacpp/CMakeLists.txt b/extensions/llamacpp/CMakeLists.txt
@@ -31,7 +31,7 @@ add_minifi_library(minifi-llamacpp SHARED ${SOURCES})
 target_include_directories(minifi-llamacpp PUBLIC "${CMAKE_SOURCE_DIR}/extensions/llamacpp")
 target_include_directories(minifi-llamacpp PUBLIC "${LLAMACPP_INCLUDE_DIRS}")
 
-target_link_libraries(minifi-llamacpp minifi-cpp-extension-lib llama)
+target_link_libraries(minifi-llamacpp minifi-cpp-extension-lib llama mtmd)
 
 register_c_api_extension(minifi-llamacpp "LLAMACPP EXTENSION" LLAMACPP-EXTENSION "Provides llama.cpp support" "extensions/llamacpp/tests")
 
diff --git a/extensions/llamacpp/processors/DefaultLlamaContext.cpp b/extensions/llamacpp/processors/DefaultLlamaContext.cpp
@@ -16,8 +16,12 @@
  */
 
 #include "DefaultLlamaContext.h"
+
+#include <ranges>
+
 #include "minifi-cpp/Exception.h"
 #include "fmt/format.h"
+#include "mtmd/mtmd-helper.h"
 
 namespace org::apache::nifi::minifi::extensions::llamacpp::processors {
 
@@ -41,7 +45,8 @@ constexpr size_t DEFAULT_BUFFER_SIZE = 4096;
 }  // namespace
 
 
-DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path, const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params) {
+DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path, const std::optional<std::filesystem::path>& multimodal_model_path,
+    const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params, const std::shared_ptr<core::logging::Logger>& logger) {
   llama_model_ = llama_model_load_from_file(model_path.string().c_str(), llama_model_default_params());  // NOLINT(cppcoreguidelines-prefer-member-initializer)
   if (!llama_model_) {
     throw Exception(ExceptionType::PROCESS_SCHEDULE_EXCEPTION, fmt::format("Failed to load model from '{}'", model_path.string()));
@@ -54,7 +59,7 @@ DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path
   ctx_params.n_seq_max = llama_ctx_params.n_seq_max;
   ctx_params.n_threads = llama_ctx_params.n_threads;
   ctx_params.n_threads_batch = llama_ctx_params.n_threads_batch;
-  ctx_params.flash_attn = false;
+  ctx_params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
   llama_ctx_ = llama_init_from_model(llama_model_, ctx_params);
 
   auto sparams = llama_sampler_chain_default_params();
@@ -73,9 +78,38 @@ DefaultLlamaContext::DefaultLlamaContext(const std::filesystem::path& model_path
     llama_sampler_chain_add(llama_sampler_, llama_sampler_init_temp(*llama_sampler_params.temperature));
   }
   llama_sampler_chain_add(llama_sampler_, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+
+  // The multimodal model is optional; without it this context only handles text prompts.
+  if (!multimodal_model_path) {
+    logger->log_info("No multimodal model path provided");
+    return;
+  }
+
+  // Load the multimodal model with GPU usage and flash attention turned off.
+  mtmd_context_params mparams = mtmd_context_params_default();
+  mparams.use_gpu = false;
+  mparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+
+  multimodal_ctx_ = mtmd_init_from_file(multimodal_model_path->string().c_str(), llama_model_, mparams);
+  if (!multimodal_ctx_) {
+    // The destructor does not run when the constructor throws, so release the sampler, context and
+    // model acquired above before reporting the failure; otherwise they would leak.
+    llama_sampler_free(llama_sampler_);
+    llama_sampler_ = nullptr;
+    llama_free(llama_ctx_);
+    llama_ctx_ = nullptr;
+    llama_model_free(llama_model_);
+    llama_model_ = nullptr;
+    throw Exception(ExceptionType::PROCESS_SCHEDULE_EXCEPTION, fmt::format("Failed to load multimodal model from '{}'", multimodal_model_path->string()));
+  }
+
+  logger->log_info("Successfully loaded multimodal model from '{}'", multimodal_model_path->string());
 }
 
 DefaultLlamaContext::~DefaultLlamaContext() {
+  // Release the multimodal context first: it is the last resource the constructor creates.
+  mtmd_free(multimodal_ctx_);
+  multimodal_ctx_ = nullptr;
   llama_sampler_free(llama_sampler_);
   llama_sampler_ = nullptr;
   llama_free(llama_ctx_);
@@ -108,24 +131,123 @@ std::optional<std::string> DefaultLlamaContext::applyTemplate(const std::vector<
   return text;
 }
 
-nonstd::expected<GenerationResult, std::string> DefaultLlamaContext::generate(const std::string& input, std::function<void(std::string_view/*token*/)> token_handler) {
+namespace {
+
+// Deleter that lets std::unique_ptr release an mtmd_bitmap through mtmd_bitmap_free.
+struct mtmd_bitmap_deleter {
+  void operator()(mtmd_bitmap* val) { mtmd_bitmap_free(val); }
+};
+using unique_bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;
+
+// Deleter that lets std::unique_ptr release mtmd_input_chunks through mtmd_input_chunks_free.
+struct mtmd_input_chunks_deleter {
+  void operator()(mtmd_input_chunks* val) { mtmd_input_chunks_free(val); }
+};
+using unique_mtmd_input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
+
+// Move-only owner of an optional llama_batch; a held batch is released with llama_batch_free.
+class unique_llama_batch {
+ public:
+  explicit unique_llama_batch(std::optional<llama_batch> batch = std::nullopt): batch_(std::move(batch)) {}
+
+  // A moved-from std::optional stays engaged, so defaulted moves would leave two owners of the same
+  // batch and free it twice. Take the handle over and disengage the source explicitly.
+  unique_llama_batch(unique_llama_batch&& other) noexcept: batch_(other.batch_) {
+    other.batch_.reset();
+  }
+  unique_llama_batch& operator=(unique_llama_batch&& other) noexcept {
+    if (this != &other) {
+      reset(other.batch_);
+      other.batch_.reset();
+    }
+    return *this;
+  }
+  unique_llama_batch(const unique_llama_batch&) = delete;
+  unique_llama_batch& operator=(const unique_llama_batch&) = delete;
+
+  // Returns a copy of the (possibly empty) batch handle; ownership stays with this object.
+  std::optional<llama_batch> get() const {
+    return batch_;
+  }
+
+  // Enables batch->field access to the held batch; only valid while a batch is held.
+  std::optional<llama_batch>& operator->() {
+    return batch_;
+  }
+
+  // Frees the currently held batch, if any, then takes ownership of the given one.
+  void reset(std::optional<llama_batch> batch = std::nullopt) {
+    if (batch_) {
+      llama_batch_free(batch_.value());
+    }
+    batch_ = std::move(batch);
+  }
+
+  ~unique_llama_batch() {
+    reset();
+  }
+
+ private:
+  std::optional<llama_batch> batch_;
+};
+
+}  // namespace
+
+// Generates a completion for `prompt`, passing every produced token to `token_handler`.
+// With a multimodal context loaded, `files` holds the raw media buffers that the media markers in
+// `prompt` refer to; without one, `files` must be empty. Decode failures are returned as the
+// unexpected value, while media loading, tokenization and evaluation failures throw.
+nonstd::expected<GenerationResult, std::string> DefaultLlamaContext::generate(const std::string& prompt, const std::vector<std::vector<std::byte>>& files, std::function<void(std::string_view/*token*/)> token_handler) {
   GenerationResult result{};
   auto start_time = std::chrono::steady_clock::now();
+  // Start from an empty context: drop whatever a previous call left in sequence 0.
+  llama_memory_seq_rm(llama_get_memory(llama_ctx_), 0, -1, -1);
   const llama_vocab * vocab = llama_model_get_vocab(llama_model_);
-  std::vector<llama_token> tokenized_input = tokenizeInput(vocab, input);
-  result.num_tokens_in = gsl::narrow<uint64_t>(tokenized_input.size());
+  llama_pos n_past = 0;
+  std::vector<llama_token> tokenized_input;
+  unique_llama_batch batch;
+  int32_t decode_status = 0;
+  if (multimodal_ctx_) {
+    // `files` may be empty here (e.g. a flow file without content): the prompt then carries no media
+    // marker and is evaluated as plain text, so this must not be a hard contract violation.
+    std::vector<unique_bitmap_ptr> bitmaps;
+    bitmaps.reserve(files.size());
+    for (const auto& file : files) {
+      unique_bitmap_ptr bitmap{mtmd_helper_bitmap_init_from_buf(multimodal_ctx_, reinterpret_cast<const unsigned char*>(file.data()), file.size())};
+      if (!bitmap) {
+        throw Exception(PROCESSOR_EXCEPTION, "Failed to create multimodal bitmap from buffer");
+      }
+      bitmaps.push_back(std::move(bitmap));
+    }
+    mtmd_input_text inp_txt = {
+      .text = prompt.c_str(),
+      .add_special = true,
+      .parse_special = true,
+    };
+    unique_mtmd_input_chunks_ptr chunks{mtmd_input_chunks_init()};
+    auto bitmap_c_ptrs = bitmaps | std::views::transform([] (auto& ptr) {return ptr.get();}) | std::ranges::to<std::vector<const mtmd_bitmap*>>();
+    auto tokenized = mtmd_tokenize(multimodal_ctx_, chunks.get(), &inp_txt, bitmap_c_ptrs.data(), bitmap_c_ptrs.size());
+    if (tokenized != 0) {
+      throw Exception(PROCESSOR_EXCEPTION, fmt::format("Failed to tokenize multimodal prompt, error: {}", tokenized));
+    }
+    // Report the input size for multimodal prompts as well; it was previously left at zero.
+    result.num_tokens_in = gsl::narrow<uint64_t>(mtmd_helper_get_n_tokens(chunks.get()));
+    // Evaluate in batches of the context's configured size instead of one token per decode call.
+    auto status = mtmd_helper_eval_chunks(multimodal_ctx_, llama_ctx_, chunks.get(), 0, 0, gsl::narrow<int32_t>(llama_n_batch(llama_ctx_)), true, &n_past);
+    if (status != 0) {
+      throw Exception(PROCESSOR_EXCEPTION, fmt::format("Failed to eval multimodal chunks, error: {}", status));
+    }
+  } else {
+    gsl_Assert(files.empty());
+    tokenized_input = tokenizeInput(vocab, prompt);
+    n_past = gsl::narrow<llama_pos>(tokenized_input.size());
+    result.num_tokens_in = gsl::narrow<uint64_t>(tokenized_input.size());
+    decode_status = llama_decode(llama_ctx_, llama_batch_get_one(tokenized_input.data(), n_past));
+  }
 
-  llama_batch batch = llama_batch_get_one(tokenized_input.data(), gsl::narrow<int32_t>(tokenized_input.size()));
   llama_token new_token_id = 0;
   bool first_token_generated = false;
-  while (true) {
-    int32_t res = llama_decode(llama_ctx_, batch);
-    if (res == 1) {
-      return nonstd::make_unexpected("Could not find a KV slot for the batch (try reducing the size of the batch or increase the context)");
-    } else if (res < 0) {
-      return nonstd::make_unexpected("Error occurred while executing llama decode");
-    }
-
+  while (decode_status == 0) {
     new_token_id = llama_sampler_sample(llama_sampler_, llama_ctx_, -1);
     if (!first_token_generated) {
       result.time_to_first_token = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start_time);
@@ -147,8 +246,27 @@ nonstd::expected<GenerationResult, std::string> DefaultLlamaContext::generate(co
     gsl_Assert(len < 128);
 
     std::string_view token_str{buf.data(), gsl::narrow<std::string_view::size_type>(len)};
-    batch = llama_batch_get_one(&new_token_id, 1);
+    batch.reset(llama_batch_init(1, 0, 1));
+    batch->n_tokens = 1;
+    batch->token[0] = new_token_id;
+    batch->pos[0] = n_past;
+    batch->n_seq_id[0] = 1;
+    batch->seq_id[0][0] = 0;
+    batch->logits[0] = true;
+    ++n_past;
     token_handler(token_str);
+
+    decode_status = llama_decode(llama_ctx_, batch.get().value());
+  }
+
+  if (decode_status == 1) {
+    return nonstd::make_unexpected("Could not find a KV slot for the batch (try reducing the size of the batch or increase the context)");
+  }
+  if (decode_status == 2) {
+    return nonstd::make_unexpected("Llama decode aborted");
+  }
+  if (decode_status < 0) {
+    return nonstd::make_unexpected("Error occurred while executing llama decode");
   }
 
   result.tokens_per_second =

diff --git a/extensions/llamacpp/processors/DefaultLlamaContext.h b/extensions/llamacpp/processors/DefaultLlamaContext.h
@@ -19,25 +19,29 @@
 #include "LlamaContext.h"
 #include "llama.h"
 #include "LlamaBackendInitializer.h"
+#include "mtmd/mtmd.h"
+#include "minifi-cpp/core/logging/Logger.h"
 
 namespace org::apache::nifi::minifi::extensions::llamacpp::processors {
 
 class DefaultLlamaContext : public LlamaContext {
  public:
-  DefaultLlamaContext(const std::filesystem::path& model_path, const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params);
+  DefaultLlamaContext(const std::filesystem::path& model_path, const std::optional<std::filesystem::path>& multimodal_model_path,
+      const LlamaSamplerParams& llama_sampler_params, const LlamaContextParams& llama_ctx_params, const std::shared_ptr<core::logging::Logger>& logger);
   DefaultLlamaContext(const DefaultLlamaContext&) = delete;
   DefaultLlamaContext(DefaultLlamaContext&&) = delete;
   DefaultLlamaContext& operator=(const DefaultLlamaContext&) = delete;
   DefaultLlamaContext& operator=(DefaultLlamaContext&&) = delete;
   ~DefaultLlamaContext() override;
 
   std::optional<std::string> applyTemplate(const std::vector<LlamaChatMessage>& messages) override;
-  nonstd::expected<GenerationResult, std::string> generate(const std::string& input, std::function<void(std::string_view/*token*/)> token_handler) override;
+  nonstd::expected<GenerationResult, std::string> generate(const std::string& prompt, const std::vector<std::vector<std::byte>>& files, std::function<void(std::string_view/*token*/)> token_handler) override;
 
  private:
   const LlamaBackendInitializer& llama_context_initializer_ = LlamaBackendInitializer::get();
   llama_model* llama_model_{};
   llama_context* llama_ctx_{};
+  mtmd_context* multimodal_ctx_{};
   llama_sampler* llama_sampler_{};
 };
 

diff --git a/extensions/llamacpp/processors/LlamaContext.h b/extensions/llamacpp/processors/LlamaContext.h
@@ -59,7 +59,7 @@ struct GenerationResult {
 class LlamaContext {
  public:
   virtual std::optional<std::string> applyTemplate(const std::vector<LlamaChatMessage>& messages) = 0;
-  virtual nonstd::expected<GenerationResult, std::string> generate(const std::string& input, std::function<void(std::string_view/*token*/)> token_handler) = 0;
+  virtual nonstd::expected<GenerationResult, std::string> generate(const std::string& prompt, const std::vector<std::vector<std::byte>>& files, std::function<void(std::string_view/*token*/)> token_handler) = 0;
   virtual ~LlamaContext() = default;
 };
 

diff --git a/extensions/llamacpp/processors/RunLlamaCppInference.cpp b/extensions/llamacpp/processors/RunLlamaCppInference.cpp
@@ -31,7 +31,9 @@ namespace org::apache::nifi::minifi::extensions::llamacpp::processors {
 MinifiStatus RunLlamaCppInference::onScheduleImpl(api::core::ProcessContext& context) {
   model_path_.clear();
   model_path_ = api::utils::parseProperty(context, ModelPath);
+  multimodal_model_path_ = api::utils::parseOptionalProperty(context, MultiModalModelPath);
   system_prompt_ = context.getProperty(SystemPrompt).value_or("");
+  output_attribute_ = api::utils::parseOptionalProperty(context, OutputAttributeName);
 
   LlamaSamplerParams llama_sampler_params;
   llama_sampler_params.temperature = api::utils::parseOptionalFloatProperty(context, Temperature);
@@ -53,7 +55,7 @@ MinifiStatus RunLlamaCppInference::onScheduleImpl(api::core::ProcessContext& con
   if (llama_context_provider_) {
     llama_ctx_ = llama_context_provider_(model_path_, llama_sampler_params, llama_ctx_params);
   } else {
-    llama_ctx_ = std::make_unique<DefaultLlamaContext>(model_path_, llama_sampler_params, llama_ctx_params);
+    llama_ctx_ = std::make_unique<DefaultLlamaContext>(model_path_, multimodal_model_path_, llama_sampler_params, llama_ctx_params, logger_);
   }
 
   return MINIFI_STATUS_SUCCESS;
@@ -76,10 +78,16 @@ MinifiStatus RunLlamaCppInference::onTriggerImpl(api::core::ProcessContext& cont
   auto prompt = context.getProperty(Prompt, &flow_file).value_or("");
 
   auto read_result = session.readBuffer(flow_file);
+  std::vector<std::vector<std::byte>> files;
   std::string input_data_and_prompt;
   if (!read_result.empty()) {
     input_data_and_prompt.append("Input data (or flow file content):\n");
-    input_data_and_prompt.append({reinterpret_cast<const char*>(read_result.data()), read_result.size()});
+    if (multimodal_model_path_) {
+      input_data_and_prompt.append(mtmd_default_marker());
+      files.push_back(std::move(read_result));
+    } else {
+      input_data_and_prompt.append({reinterpret_cast<const char*>(read_result.data()), read_result.size()});
+    }
     input_data_and_prompt.append("\n\n");
   }
   input_data_and_prompt.append(prompt);
@@ -111,7 +119,7 @@ MinifiStatus RunLlamaCppInference::onTriggerImpl(api::core::ProcessContext& cont
   auto start_time = std::chrono::steady_clock::now();
 
   std::string text;
-  auto generation_result = llama_ctx_->generate(*input, [&] (std::string_view token) {
+  auto generation_result = llama_ctx_->generate(*input, files, [&] (std::string_view token) {
     text += token;
   });
 
@@ -133,7 +141,12 @@ MinifiStatus RunLlamaCppInference::onTriggerImpl(api::core::ProcessContext& cont
   session.setAttribute(flow_file, LlamaCppTimeToFirstToken.name, std::to_string(generation_result->time_to_first_token.count()) + " ms");
   session.setAttribute(flow_file, LlamaCppTokensPerSecond.name, fmt::format("{:.2f}", generation_result->tokens_per_second));
 
-  session.writeBuffer(flow_file, text);
+  if (output_attribute_) {
+    session.setAttribute(flow_file, output_attribute_.value(), text);
+  } else {
+    session.writeBuffer(flow_file, text);
+  }
+
   session.transfer(std::move(flow_file), Success);
 
   return MINIFI_STATUS_SUCCESS;

diff --git a/extensions/llamacpp/processors/RunLlamaCppInference.h b/extensions/llamacpp/processors/RunLlamaCppInference.h
@@ -59,6 +59,12 @@ class RunLlamaCppInference : public api::core::ProcessorImpl {
       .withDescription("The filesystem path of the model file in gguf format.")
       .isRequired(true)
       .build();
+  EXTENSIONAPI static constexpr auto OutputAttributeName = core::PropertyDefinitionBuilder<>::createProperty("Output Attribute Name")
+      .withDescription("Specify the attribute to use as output, if not provided, the content is overridden instead.")
+      .build();
+  EXTENSIONAPI static constexpr auto MultiModalModelPath = core::PropertyDefinitionBuilder<>::createProperty("MultiModal Model Path")
+      .withDescription("The filesystem path of the multimodal model (visual, audio) file in gguf format.")
+      .build();
   EXTENSIONAPI static constexpr auto Temperature = core::PropertyDefinitionBuilder<>::createProperty("Temperature")
       .withDescription("The temperature to use for sampling.")
       .withDefaultValue("0.8")
@@ -128,6 +134,8 @@ class RunLlamaCppInference : public api::core::ProcessorImpl {
 
   EXTENSIONAPI static constexpr auto Properties = std::to_array<core::PropertyReference>({
     ModelPath,
+    OutputAttributeName,
+    MultiModalModelPath,
     Temperature,
     TopK,
     TopP,
@@ -167,7 +175,9 @@ class RunLlamaCppInference : public api::core::ProcessorImpl {
   void increaseTokensOut(uint64_t token_count);
 
   std::string model_path_;
+  std::optional<std::string> multimodal_model_path_;
   std::string system_prompt_;
+  std::optional<std::string> output_attribute_;
 
   LlamaContextProvider llama_context_provider_;
   std::unique_ptr<LlamaContext> llama_ctx_;

diff --git a/extensions/llamacpp/tests/RunLlamaCppInferenceTests.cpp b/extensions/llamacpp/tests/RunLlamaCppInferenceTests.cpp
@@ -37,7 +37,7 @@ class MockLlamaContext : public processors::LlamaContext {
     return "Test input";
   }
 
-  nonstd::expected<processors::GenerationResult, std::string> generate(const std::string& input, std::function<void(std::string_view/*token*/)> token_handler) override {
+  nonstd::expected<processors::GenerationResult, std::string> generate(const std::string& input, const std::vector<std::vector<std::byte>>& /*files*/, std::function<void(std::string_view/*token*/)> token_handler) override {
     if (fail_generation_) {
       return nonstd::make_unexpected("Generation failed");
     }