Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class ZeroInferRequest final : public SyncInferRequest {
void set_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) override;
void set_tensors(const ov::Output<const ov::Node>& port,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;
void set_tensors(const std::map<ov::Output<const ov::Node>, ov::SoPtr<ov::ITensor>>& ports_tensors);

void infer() override;
void infer_async() override;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ struct Pipeline final {
void reset() const;

void update_graph_arguments(uint32_t index, const std::shared_ptr<ZeroTensor>& tensor);
void update_graph_arguments(const std::vector<std::pair<uint32_t, std::shared_ptr<ZeroTensor>>>& tensors);
void update_graph_arguments(uint32_t index, const std::shared_ptr<ZeroTensor>& tensor, size_t batch_index);

std::vector<ov::ProfilingInfo> get_profiling_info() const;
Expand Down
141 changes: 141 additions & 0 deletions src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,147 @@ void ZeroInferRequest::set_tensors(const ov::Output<const ov::Node>& port,
// If command list updates are not supported, fallback to copying tensors every time.
}

/// Sets several input/output tensors at once and, when the mutable-command-list extension is
/// available, updates the pipeline's graph arguments with a single batched driver call.
/// @param ports_tensors map of model port -> user tensor to bind to that port.
/// @throws ov::Exception when a port cannot be found, a tensor fails validation, or a dynamic
///         batch candidate conflicts with the batch size already chosen for this request.
void ZeroInferRequest::set_tensors(const std::map<ov::Output<const ov::Node>, ov::SoPtr<ov::ITensor>>& ports_tensors) {
    OV_ITT_TASK_CHAIN(ZERO_SET_TENSOR, itt::domains::LevelZeroBackend, "set_tensors", "set_tensors");
    bool updateCommandListArg = false;
    // (driver argument index, Level Zero tensor) pairs collected so the mutable command list can be
    // updated once, after all ports were processed. Built incrementally (reserve + emplace_back)
    // instead of pre-sized: the original pre-sized vector could hand {0, nullptr} placeholders to
    // update_graph_arguments whenever a port was skipped.
    std::vector<std::pair<uint32_t, std::shared_ptr<ZeroTensor>>> zeroTensors;
    zeroTensors.reserve(ports_tensors.size());
    for (const auto& [port, tensor] : ports_tensors) {
        auto foundPort = find_port(port);
        OPENVINO_ASSERT(foundPort.found(), "Cannot find tensor for port ", port);
        try {
            check_tensor(port,
                         tensor,
                         foundPort.is_input() ? _metadata.inputs.at(foundPort.idx).supportsStridedLayout
                                              : _metadata.outputs.at(foundPort.idx).supportsStridedLayout);
        } catch (const ov::Exception& ex) {
            OPENVINO_THROW("Failed to set tensor. ", ex.what());
        }

        if (foundPort.is_input()) {
            if (get_user_input(foundPort.idx)._ptr == tensor._ptr) {
                // Fix: "continue" instead of "return" - a single unchanged tensor must not abort
                // processing of the remaining ports in the map (nor skip the final batched update).
                _logger.debug("ZeroInferRequest::set_tensor - got the same tensor, do nothing");
                continue;
            }

            const auto& ioShape = _compiledModel->inputs()[foundPort.idx].get_partial_shape();
            auto batchSizeCandidate =
                determine_dynamic_batch_size(_metadata.inputs.at(foundPort.idx), ioShape, tensor._ptr, std::nullopt);

            if (batchSizeCandidate.has_value()) {
                if (!_dynamicBatchValueChanged) {
                    // A byte-size change versus the previously bound tensor(s) signals that the
                    // dynamic batch value changed for this request.
                    if (get_user_input(foundPort.idx)._ptr != nullptr &&
                        get_user_input(foundPort.idx)->get_byte_size() * get_user_inputs(foundPort.idx).size() !=
                            tensor->get_byte_size()) {
                        _dynamicBatchValueChanged = true;
                        _graph->set_batch_size(batchSizeCandidate.value());
                    } else if (_graph->get_batch_size().has_value()) {
                        if (batchSizeCandidate.value() != _graph->get_batch_size().value()) {
                            _dynamicBatchValueChanged = true;
                            _graph->set_batch_size(batchSizeCandidate.value());
                        }
                    } else {
                        _graph->set_batch_size(batchSizeCandidate.value());
                    }
                } else if (batchSizeCandidate.value() != _graph->get_batch_size().value()) {
                    OPENVINO_THROW("Batching size is not matching all the tensors.");
                }
            }

            if (is_batched_input(foundPort.idx)) {
                // Reset vector size to 1 if set_tensor is called after set_tensors
                get_level_zero_inputs(foundPort.idx).resize(1);
                get_level_zero_inputs(foundPort.idx).shrink_to_fit();
                get_level_zero_input(foundPort.idx) = {};
                get_user_inputs(foundPort.idx).resize(1);
                get_user_inputs(foundPort.idx).shrink_to_fit();
                get_user_input(foundPort.idx) = {};
            }

            get_user_input(foundPort.idx) = tensor;
        } else {
            if (_userOutputTensors.at(foundPort.idx)._ptr == tensor._ptr) {
                // Fix: "continue" instead of "return" (see the input branch above).
                _logger.debug("ZeroInferRequest::set_tensor - got the same tensor, do nothing");
                continue;
            }

            const auto& ioShape = _compiledModel->outputs()[foundPort.idx].get_partial_shape();
            auto batchSizeCandidate =
                determine_dynamic_batch_size(_metadata.outputs.at(foundPort.idx), ioShape, tensor._ptr, std::nullopt);

            if (batchSizeCandidate.has_value()) {
                if (!_dynamicBatchValueChanged) {
                    if (_userOutputTensors.at(foundPort.idx)._ptr != nullptr &&
                        _userOutputTensors.at(foundPort.idx)->get_byte_size() != tensor->get_byte_size()) {
                        _dynamicBatchValueChanged = true;
                        _graph->set_batch_size(batchSizeCandidate.value());
                    } else if (_graph->get_batch_size().has_value()) {
                        if (batchSizeCandidate.value() != _graph->get_batch_size().value()) {
                            _dynamicBatchValueChanged = true;
                            _graph->set_batch_size(batchSizeCandidate.value());
                        }
                    } else {
                        _graph->set_batch_size(batchSizeCandidate.value());
                    }
                } else if (batchSizeCandidate.value() != _graph->get_batch_size().value()) {
                    OPENVINO_THROW("Batching size is not matching all the tensors.");
                }
            }

            _userOutputTensors.at(foundPort.idx) = tensor;
        }

        if (_initStructs->getMutableCommandListExtVersion() >= ZE_MAKE_VERSION(1, 0)) {
            auto& levelZeroTensor =
                foundPort.is_input() ? get_level_zero_input(foundPort.idx) : _levelZeroOutputTensors.at(foundPort.idx);

            try {
                _logger.debug("ZeroInferRequest::set_tensor - create zero tensor");
                OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "create zero tensor");
                // Try to use the user tensor directly if its underlying data is already allocated in the same Level
                // Zero context.
                levelZeroTensor = std::make_shared<ZeroTensor>(_initStructs, _config, tensor);
                updateCommandListArg = true;
            } catch (const ZeroMemException& exception) {
                _logger.debug(
                    "ZeroInferRequest::set_tensor - exception caught while trying to create a Level Zero tensor "
                    "from the user tensor: %s",
                    exception.what());

                // Check if the current Level Zero tensor was previously shared with the user. If so, it cannot be
                // reused; allocate a new tensor to back up the user tensor (which cannot be imported or used directly).
                if (_dynamicBatchValueChanged || levelZeroTensor == nullptr || !levelZeroTensor->can_be_reused()) {
                    _logger.debug("ZeroInferRequest::set_tensor - allocate locally L0 tensor");
                    OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "allocate tensor");

                    auto batch = _graph->get_batch_size();
                    levelZeroTensor = allocate_tensor(foundPort.idx, foundPort.is_input(), batch);
                    updateCommandListArg = true;
                } else {
                    _logger.debug("ZeroInferRequest::set_tensor - reusing the level zero tensor since it is not shared "
                                  "with the user");
                }
            }
            if (_pipelineIsCreated && updateCommandListArg && !_dynamicBatchValueChanged) {
                OPENVINO_ASSERT(levelZeroTensor->data(), "Empty buffer");
            }
            // Only ports that actually changed contribute an entry; no placeholder pairs remain.
            zeroTensors.emplace_back(foundPort.is_input() ? _metadata.inputs.at(foundPort.idx).indexUsedByDriver
                                                          : _metadata.outputs.at(foundPort.idx).indexUsedByDriver,
                                     levelZeroTensor);
        }
        // If command list updates are not supported, fallback to copying tensors every time.
    }
    if (_pipelineIsCreated && updateCommandListArg && !_dynamicBatchValueChanged) {
        // Fix: this log message previously said "infer_async" (copy/paste error).
        _logger.debug("ZeroInferRequest::set_tensors - update command list");

        OV_ITT_TASK_NEXT(ZERO_SET_TENSOR, "update_graph_arguments");
        _pipeline->update_graph_arguments(zeroTensors);
    }
}

ov::SoPtr<ov::ITensor> ZeroInferRequest::get_tensor(const ov::Output<const ov::Node>& port) const {
OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "get_tensor");

Expand Down
35 changes: 35 additions & 0 deletions src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,41 @@ void Pipeline::update_graph_arguments(uint32_t index, const std::shared_ptr<Zero
}
};

void Pipeline::update_graph_arguments(const std::vector<std::pair<uint32_t, std::shared_ptr<ZeroTensor>>>& tensors) {
OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_UMCL, itt::domains::LevelZeroBackend, "Pipeline", "updateCommandList");
_logger.debug("Pipeline - updateCommandList");

const size_t number_of_command_lists = _command_lists.size();

for (size_t i = 0; i < number_of_command_lists; i++) {
std::vector<std::pair<ze_mutable_graph_argument_exp_desc_t, std::optional<ze_graph_argument_value_strides_t>>>
descs;
descs.reserve(tensors.size());
for (const auto& [index, tensor] : tensors) {
if (tensor->get_element_type().bitwidth() < 8 || tensor->is_continuous() || tensor->get_strides().empty()) {
_command_lists.at(i)->updateMutableCommandList(
index,
static_cast<const unsigned char*>(tensor->data()) +
(i * tensor->get_byte_size()) / number_of_command_lists,
descs);
} else {
_command_lists.at(i)->updateMutableCommandListWithStrides(
index,
static_cast<const unsigned char*>(tensor->data()) + (i * tensor->get_strides()[0]),
get_strides(tensor->get_strides(), tensor->get_element_type().size()),
descs);
}
}
ze_mutable_commands_exp_desc_t mutable_commands_exp_desc_t = {ZE_STRUCTURE_TYPE_MUTABLE_COMMANDS_EXP_DESC,
&descs.at(0).first,
0};

THROW_ON_FAIL_FOR_LEVELZERO(
"zeCommandListUpdateMutableCommandsExp",
zeCommandListUpdateMutableCommandsExp(_command_lists.at(i)->handle(), &mutable_commands_exp_desc_t));
}
};

void Pipeline::update_graph_arguments(uint32_t index, const std::shared_ptr<ZeroTensor>& tensor, size_t batch_index) {
OV_ITT_TASK_CHAIN(ZERO_EXECUTOR_IP_UMCL, itt::domains::LevelZeroBackend, "Pipeline", "updateCommandListIndex");
_logger.debug("Pipeline - updateCommandListIndex");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,20 @@ class CommandList {
void appendGraphExecute(const ze_graph_handle_t& graph_handle,
const ze_graph_profiling_query_handle_t& profiling_query_handle) const;
void updateMutableCommandList(uint32_t index, const void* data) const;
void updateMutableCommandList(
uint32_t index,
const void* data,
std::vector<std::pair<ze_mutable_graph_argument_exp_desc_t, std::optional<ze_graph_argument_value_strides_t>>>&
descs) const;
void updateMutableCommandListWithStrides(uint32_t index,
const void* data,
const std::vector<size_t>& strides) const;
void updateMutableCommandListWithStrides(
uint32_t index,
const void* data,
const std::vector<size_t>& strides,
std::vector<std::pair<ze_mutable_graph_argument_exp_desc_t, std::optional<ze_graph_argument_value_strides_t>>>&
descs) const;
void appendNpuTimestamp(uint64_t* timestamp_buff) const;
void appendBarrier() const;
void close() const;
Expand Down
67 changes: 67 additions & 0 deletions src/plugins/intel_npu/src/utils/src/zero/zero_wrappers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,29 @@ void CommandList::updateMutableCommandList(uint32_t index, const void* data) con
THROW_ON_FAIL_FOR_LEVELZERO("zeCommandListUpdateMutableCommandsExp",
zeCommandListUpdateMutableCommandsExp(_handle, &mutable_commands_exp_desc_t));
}
// Appends a mutable graph-argument descriptor for (index, data) to `descs` and links it into the
// pNext chain formed by the previously appended descriptors. The actual driver call is issued later
// by the caller with the head of the chain.
// NOTE(review): the chain stores raw pointers into `descs`; the caller must reserve() enough
// capacity up front so this insert never reallocates the vector, otherwise every previously linked
// pNext pointer dangles — confirm all call sites do this.
void CommandList::updateMutableCommandList(
    uint32_t index,
    const void* data,
    std::vector<std::pair<ze_mutable_graph_argument_exp_desc_t, std::optional<ze_graph_argument_value_strides_t>>>&
        descs) const {
    auto desc = ze_mutable_graph_argument_exp_desc_t{};
    // Older drivers (< 1.11) only understand the deprecated structure-type value.
    desc.stype = (_init_structs->getZeDrvApiVersion() >= ZE_MAKE_VERSION(1, 11))
                     ? ZE_STRUCTURE_TYPE_MUTABLE_GRAPH_ARGUMENT_EXP_DESC
                     : static_cast<ze_structure_type_t>(ZE_STRUCTURE_TYPE_MUTABLE_GRAPH_ARGUMENT_EXP_DESC_DEPRECATED);
    desc.commandId = _command_id;
    desc.argIndex = index;
    desc.pArgValue = data;
    auto it = descs.insert(descs.end(), {desc, std::nullopt});

    // Link the new descriptor after the previous chain tail: either the previous descriptor's
    // strides struct (when present) or the previous descriptor itself.
    if (it > descs.begin()) {
        auto prev_elm = std::prev(it);
        if (prev_elm->second != std::nullopt) {
            prev_elm->second->pNext = &it->first;
        } else {
            prev_elm->first.pNext = &it->first;
        }
    }
}
void CommandList::updateMutableCommandListWithStrides(uint32_t index,
const void* data,
const std::vector<size_t>& strides) const {
Expand Down Expand Up @@ -189,6 +212,50 @@ void CommandList::updateMutableCommandListWithStrides(uint32_t index,
zeCommandListUpdateMutableCommandsExp(_handle, &mutable_commands_exp_desc_t));
}

void CommandList::updateMutableCommandListWithStrides(
uint32_t index,
const void* data,
const std::vector<size_t>& strides,
std::vector<std::pair<ze_mutable_graph_argument_exp_desc_t, std::optional<ze_graph_argument_value_strides_t>>>&
descs) const {
auto desc = ze_mutable_graph_argument_exp_desc_t{};
desc.commandId = _command_id;
desc.argIndex = index;
desc.pArgValue = data;
auto it = descs.insert(descs.end(), {desc, std::nullopt});

if (!strides.empty()) {
if (_init_structs->getGraphDdiTable().version() < ZE_MAKE_VERSION(1, 15)) {
OPENVINO_THROW("Strides are not supported by the current driver version.");
}

if (strides.size() > ZE_MAX_GRAPH_ARGUMENT_DIMENSIONS_SIZE) {
OPENVINO_THROW("The driver does not support strides with more than",
ZE_MAX_GRAPH_ARGUMENT_DIMENSIONS_SIZE,
"dimensions.");
}

it->second = ze_graph_argument_value_strides_t{};
it->second->stype = ZE_STRUCTURE_TYPE_GRAPH_ARGUMENT_STRIDES;
for (size_t i = 0; i < strides.size(); ++i) {
if (strides[i] > std::numeric_limits<uint32_t>::max()) {
OPENVINO_THROW("Stride value exceeds uint32_t range supported by the driver");
}
it->second->userStrides[i] = static_cast<uint32_t>(strides[i]);
}

it->first.pNext = &it->second;
}

if (it > descs.begin()) {
auto prev_elm = std::prev(it);
if (prev_elm->second != std::nullopt) {
prev_elm->second->pNext = &it->first;
} else {
prev_elm->first.pNext = &it->first;
}
}
}
CommandQueue::CommandQueue(const std::shared_ptr<ZeroInitStructsHolder>& init_structs,
const ze_command_queue_priority_t& priority,
const uint32_t group_ordinal,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "internal/utils/zero/zero_wrappers.hpp"

#include "common/npu_test_env_cfg.hpp"
#include "common/utils.hpp"
#include "intel_npu/config/options.hpp"
#include "intel_npu/npu_private_properties.hpp"

using namespace ov::test::behavior;

// Instantiates the Level Zero command-list behavior test suite for the NPU device target.
INSTANTIATE_TEST_SUITE_P(compatibility_smoke_BehaviorTest,
                         ZeroCommandListsTests,
                         ::testing::Values(ov::test::utils::DEVICE_NPU),
                         ZeroCommandListsTests::getTestCaseName);
Loading