Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -894,7 +894,7 @@ Status QnnBackendManager::ResetContextPriority() {
return SetContextPriority(context_priority_);
}

Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) {
Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing, bool enable_htp_extended_udma_mode) {
if (true == context_created_) {
LOGS_DEFAULT(INFO) << "Context created already.";
return Status::OK();
Expand All @@ -910,8 +910,16 @@ Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) {
QnnContext_Config_t context_priority_config = QNN_CONTEXT_CONFIG_INIT;
ORT_RETURN_IF_ERROR(SetQnnContextConfig(context_priority_, context_priority_config));

QnnContext_Config_t context_config_extended_udma = QNN_CONTEXT_CONFIG_INIT;
QnnHtpContext_CustomConfig_t udma_custom_config;
udma_custom_config.option = QNN_HTP_CONTEXT_CONFIG_OPTION_USE_EXTENDED_UDMA;
udma_custom_config.useExtendedUdma = enable_htp_extended_udma_mode;
context_config_extended_udma.option = QNN_CONTEXT_CONFIG_OPTION_CUSTOM;
context_config_extended_udma.customConfig = &udma_custom_config;

const QnnContext_Config_t* npu_context_configs[] = {&context_priority_config,
&context_config_weight_sharing,
&context_config_extended_udma,
nullptr};

const QnnContext_Config_t* empty_context_configs[] = {nullptr};
Expand Down Expand Up @@ -1225,7 +1233,8 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger,
bool need_load_system_lib,
bool share_ep_contexts,
bool enable_vtcm_backup_buffer_sharing,
std::unordered_map<std::string, std::unique_ptr<std::vector<std::string>>>& context_bin_map) {
std::unordered_map<std::string, std::unique_ptr<std::vector<std::string>>>& context_bin_map,
bool enable_htp_extended_udma_mode) {
std::lock_guard<std::recursive_mutex> lock(logger_recursive_mutex_);
if (backend_setup_completed_) {
LOGS(logger, VERBOSE) << "Backend setup already!";
Expand Down Expand Up @@ -1322,7 +1331,7 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger,

if (status.IsOK() && (vtcm_backup_buffer_sharing_enabled_ || !load_from_cached_context)) {
status = vtcm_backup_buffer_sharing_enabled_ ? CreateContextVtcmBackupBufferSharingEnabled(context_bin_map)
: CreateContext(enable_htp_weight_sharing);
: CreateContext(enable_htp_weight_sharing, enable_htp_extended_udma_mode);

if (status.IsOK()) {
LOGS(logger, VERBOSE) << "CreateContext succeed.";
Expand Down
5 changes: 3 additions & 2 deletions onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,8 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager>
Status SetupBackend(const logging::Logger& logger, bool load_from_cached_context,
bool need_load_system_lib, bool share_ep_contexts,
bool enable_vtcm_backup_buffer_sharing,
std::unordered_map<std::string, std::unique_ptr<std::vector<std::string>>>& context_bin_map);
std::unordered_map<std::string, std::unique_ptr<std::vector<std::string>>>& context_bin_map,
bool enable_htp_extended_udma_mode);

Status CreateHtpPowerCfgId(uint32_t deviceId, uint32_t coreId, uint32_t& htp_power_config_id);

Expand Down Expand Up @@ -254,7 +255,7 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager>

Status ReleaseProfilehandle();

Status CreateContext(bool enable_htp_weight_sharing);
Status CreateContext(bool enable_htp_weight_sharing, bool enable_htp_extended_udma_mode);

Status CreateContextVtcmBackupBufferSharingEnabled(std::unordered_map<std::string,
std::unique_ptr<std::vector<std::string>>>& context_bin_map);
Expand Down
16 changes: 15 additions & 1 deletion onnxruntime/core/providers/qnn/qnn_execution_provider.cc
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,19 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
}
}

static const std::string QNN_HTP_EXTENDED_UDMA_MODE = "extended_udma";
auto htp_extended_udma_pos = provider_options_map.find(QNN_HTP_EXTENDED_UDMA_MODE);
if (htp_extended_udma_pos != provider_options_map.end()) {
if ("1" == htp_extended_udma_pos->second) {
enable_htp_extended_udma_mode_ = true;
} else if ("0" == htp_extended_udma_pos->second) {
enable_htp_extended_udma_mode_ = false;
} else {
LOGS_DEFAULT(WARNING) << "Invalid enable_htp_extended_udma_mode_ " << enable_htp_extended_udma_mode_ << " only 0 or 1 allowed. Set to 0.";
}
LOGS_DEFAULT(VERBOSE) << "User specified enable_htp_extended_udma_mode_: " << enable_htp_extended_udma_mode_;
}

// Option to skip QNN API interface version check to use other QNN library other than default.
static const std::string SKIP_QNN_VERSION_CHECK = "skip_qnn_version_check";
auto skip_qnn_version_check = ParseBoolOption(SKIP_QNN_VERSION_CHECK, false, provider_options_map);
Expand Down Expand Up @@ -948,7 +961,8 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer
context_cache_enabled_ && enable_spill_fill_buffer_,
share_ep_contexts_,
enable_vtcm_backup_buffer_sharing_,
context_bin_map);
context_bin_map,
enable_htp_extended_udma_mode_);

context_bin_map.clear();

Expand Down
1 change: 1 addition & 0 deletions onnxruntime/core/providers/qnn/qnn_execution_provider.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ class QNNExecutionProvider : public IExecutionProvider {
qnn::ModelSettings model_settings_ = {};
bool dump_json_qnn_graph_ = false;
std::string json_qnn_graph_dir_ = "";
bool enable_htp_extended_udma_mode_ = false;

// Whether this is set depends on a session option enabling it and if the RPCMEM dynamic library is available.
// This is potentially shared with HtpSharedMemoryAllocator which may be returned by CreatePreferredAllocators().
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ namespace qnnctxgen {
"\t [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n"
"\t Defaults to '1' (another EP (typically CPU EP) handles the graph I/O quantization and dequantization). \n"
"\t [QNN only] [enable_htp_spill_fill_buffer]: Enable HTP spill file buffer, used while generating QNN context binary.\n"
"\t [QNN only] [extended_udma]: Enable HTP extended UDMA mode for better performance on supported hardware, options: \n"
"\t '0' (disabled), '1' (enabled). Default: '0'. \n"
"\t [Example] -i \"vtcm_mb|8 htp_arch|73\" \n"
"\n"
"\t-h: help\n");
Expand Down Expand Up @@ -165,7 +167,7 @@ static bool ParseSessionConfigs(const std::string& configs_string,
ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str);
}
} else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization" ||
key == "enable_htp_spill_fill_buffer") {
key == "enable_htp_spill_fill_buffer" || key == "extended_udma") {
std::unordered_set<std::string> supported_options = {"0", "1"};
if (supported_options.find(value) == supported_options.end()) {
std::ostringstream str_stream;
Expand All @@ -178,7 +180,7 @@ static bool ParseSessionConfigs(const std::string& configs_string,
ORT_THROW(
"Wrong key type entered. Choose from options: ['backend_type', 'backend_path', 'vtcm_mb', "
"'htp_performance_mode', 'htp_graph_finalization_optimization_mode', 'soc_model', 'htp_arch', "
"'enable_htp_fp16_precision', 'offload_graph_io_quantization', 'enable_htp_spill_fill_buffer']");
"'enable_htp_fp16_precision', 'offload_graph_io_quantization', 'enable_htp_spill_fill_buffer', 'extended_udma']");
}

test_config.run_config.provider_options[key] = value;
Expand Down
6 changes: 4 additions & 2 deletions onnxruntime/test/onnx/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ void usage() {
"\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n"
"\t [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n"
"\t Defaults to '0' (QNN EP handles the graph I/O quantization and dequantization). \n"
"\t [QNN only] [extended_udma]: Enable HTP extended UDMA mode for better performance on supported hardware, options: \n"
"\t '0' (disabled), '1' (enabled). Default: '0'. \n"
"\t [Usage]: -e <provider_name> -i '<key1>|<value1> <key2>|<value2>' \n\n"
"\t [Example] [For QNN EP] -e qnn -i \"profiling_level|detailed backend_type|cpu\" \n\n"
"\t [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n"
Expand Down Expand Up @@ -615,7 +617,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
std::string str = str_stream.str();
ORT_THROW("Wrong value for htp_arch. select from: " + str);
}
} else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization") {
} else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization" || key == "extended_udma") {
std::unordered_set<std::string> supported_options = {"0", "1"};
if (supported_options.find(value) == supported_options.end()) {
std::ostringstream str_stream;
Expand All @@ -629,7 +631,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
"Wrong key type entered. Choose from options: ['backend_type', 'backend_path', "
"'profiling_level', 'profiling_file_path', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode', "
"'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'op_packages', 'qnn_context_priority', "
"'soc_model', 'htp_arch', 'device_id', 'enable_htp_fp16_precision', 'offload_graph_io_quantization']");
"'soc_model', 'htp_arch', 'device_id', 'enable_htp_fp16_precision', 'offload_graph_io_quantization', 'extended_udma']");
}

qnn_options[key] = value;
Expand Down
2 changes: 2 additions & 0 deletions onnxruntime/test/perftest/command_args_parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ ABSL_FLAG(std::string, i, "",
" [QNN only] [enable_htp_spill_fill_buffer]: Enable HTP spill fill buffer, used while generating QNN context binary.\n"
" [QNN only] [enable_htp_shared_memory_allocator]: Enable the QNN HTP shared memory allocator and use it for inputs and outputs. Requires libcdsprpc.so/dll to be available.\n"
" Defaults to '0' (disabled).\n"
" [QNN only] [extended_udma]: Enable HTP extended UDMA mode for better performance on supported hardware, options: \n"
" '0' (disabled), '1' (enabled). Default: '0'. \n"
" [Example] [For QNN EP] -e qnn -i \"backend_type|cpu\" \n"
"\n"
" [TensorRT only] [trt_max_partition_iterations]: Maximum iterations for TensorRT parser to get capability.\n"
Expand Down
5 changes: 3 additions & 2 deletions onnxruntime/test/perftest/ort_test_session.cc
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
"qnn_saver_path", "htp_graph_finalization_optimization_mode", "qnn_context_priority",
"htp_arch", "enable_htp_fp16_precision", "offload_graph_io_quantization",
"enable_htp_spill_fill_buffer", "enable_htp_shared_memory_allocator", "dump_json_qnn_graph",
"json_qnn_graph_dir"});
"json_qnn_graph_dir", "extended_udma"});
for (const auto& provider_option : provider_options) {
const std::string& key = provider_option.first;
const std::string& value = provider_option.second;
Expand Down Expand Up @@ -389,7 +389,8 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
key == "offload_graph_io_quantization" ||
key == "enable_htp_spill_fill_buffer" ||
key == "enable_htp_shared_memory_allocator" ||
key == "dump_json_qnn_graph") {
key == "dump_json_qnn_graph" ||
key == "extended_udma") {
std::set<std::string> supported_options = {"0", "1"};
if (supported_options.find(value) == supported_options.end()) {
std::ostringstream str_stream;
Expand Down
38 changes: 38 additions & 0 deletions onnxruntime/test/providers/qnn/qnn_basic_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1313,6 +1313,44 @@ TEST_F(QnnHTPBackendTests, DumpJsonQNNGraph) {
std::filesystem::remove_all(dump_dir);
}

// Test extended UDMA mode on supported hardware (should run successfully).
// Builds a small QDQ Add model in memory, registers the QNN EP with the
// "extended_udma" provider option enabled, and verifies load/init/run succeed.
TEST_F(QnnHTPBackendTests, ExtendedUdmaModeTest) {
  std::unique_ptr<ModelAndBuilder> model;
  std::vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
  std::vector<int64_t> shape = {1, 3, 2};

  // In-memory QDQ model that adds three identical uint8-quantized tensors.
  CreateModelInMemory(model,
                      QDQBuildAdd3Tensors<uint8_t>(TestInputDef<float>(shape, false, input_data),
                                                   TestInputDef<float>(shape, false, input_data),
                                                   TestInputDef<float>(shape, false, input_data)),
                      "add3.qdq");

  SessionOptions session_opts;
  session_opts.session_logid = "logger0";

  RunOptions run_opts;
  run_opts.run_tag = session_opts.session_logid;

  InferenceSession session_obj{session_opts, GetEnvironment()};
  onnxruntime::ProviderOptions options;

  options["backend_type"] = "htp";
  options["offload_graph_io_quantization"] = "0";
  options["htp_arch"] = "81";
  options["extended_udma"] = "1";  // The option under test.

  auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts);
  EXPECT_TRUE(session_obj.RegisterExecutionProvider(std::move(qnn_ep)).IsOK());

  // Stream the status message into each assertion so a failure reports the
  // underlying error instead of just "IsOK() evaluated to false".
  auto status = session_obj.Load(model->model_data.data(), static_cast<int>(model->model_data.size()));
  ASSERT_TRUE(status.IsOK()) << status.ErrorMessage();
  status = session_obj.Initialize();
  ASSERT_TRUE(status.IsOK()) << status.ErrorMessage();
  std::vector<OrtValue> fetches;
  status = session_obj.Run(run_opts, model->builder.feeds_, model->builder.output_names_, &fetches);
  ASSERT_TRUE(status.IsOK()) << status.ErrorMessage();
}

// Test option for offloading quantization of graph inputs and dequantization of graph outputs to the CPU EP.
TEST_F(QnnHTPBackendTests, EPOffloadsGraphIOQuantDequant) {
// Returns a function that checks that the Q/DQ ops at the graph IO boundary are offloaded to CPU
Expand Down