From 8e9dd629516f7d571770a8ecc1a0afc4a8e18de3 Mon Sep 17 00:00:00 2001 From: qti-monumeen Date: Fri, 28 Nov 2025 15:24:43 +0530 Subject: [PATCH] [QNN EP] Enablement of 64bit Udma mode --- .../qnn/builder/qnn_backend_manager.cc | 15 ++++++-- .../qnn/builder/qnn_backend_manager.h | 5 ++- .../providers/qnn/qnn_execution_provider.cc | 16 +++++++- .../providers/qnn/qnn_execution_provider.h | 1 + .../command_args_parser.cc | 6 ++- onnxruntime/test/onnx/main.cc | 6 ++- .../test/perftest/command_args_parser.cc | 2 + onnxruntime/test/perftest/ort_test_session.cc | 5 ++- .../test/providers/qnn/qnn_basic_test.cc | 38 +++++++++++++++++++ 9 files changed, 82 insertions(+), 12 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index ec86ce4c84670..865ced26007d3 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -894,7 +894,7 @@ Status QnnBackendManager::ResetContextPriority() { return SetContextPriority(context_priority_); } -Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) { +Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing, bool enable_htp_extended_udma_mode) { if (true == context_created_) { LOGS_DEFAULT(INFO) << "Context created already."; return Status::OK(); @@ -910,8 +910,16 @@ Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) { QnnContext_Config_t context_priority_config = QNN_CONTEXT_CONFIG_INIT; ORT_RETURN_IF_ERROR(SetQnnContextConfig(context_priority_, context_priority_config)); + QnnContext_Config_t context_config_extended_udma = QNN_CONTEXT_CONFIG_INIT; + QnnHtpContext_CustomConfig_t udma_custom_config; + udma_custom_config.option = QNN_HTP_CONTEXT_CONFIG_OPTION_USE_EXTENDED_UDMA; + udma_custom_config.useExtendedUdma = enable_htp_extended_udma_mode; + context_config_extended_udma.option = 
QNN_CONTEXT_CONFIG_OPTION_CUSTOM; + context_config_extended_udma.customConfig = &udma_custom_config; + const QnnContext_Config_t* npu_context_configs[] = {&context_priority_config, &context_config_weight_sharing, + &context_config_extended_udma, nullptr}; const QnnContext_Config_t* empty_context_configs[] = {nullptr}; @@ -1225,7 +1233,8 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger, bool need_load_system_lib, bool share_ep_contexts, bool enable_vtcm_backup_buffer_sharing, - std::unordered_map>>& context_bin_map) { + std::unordered_map>>& context_bin_map, + bool enable_htp_extended_udma_mode) { std::lock_guard lock(logger_recursive_mutex_); if (backend_setup_completed_) { LOGS(logger, VERBOSE) << "Backend setup already!"; @@ -1322,7 +1331,7 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger, if (status.IsOK() && (vtcm_backup_buffer_sharing_enabled_ || !load_from_cached_context)) { status = vtcm_backup_buffer_sharing_enabled_ ? CreateContextVtcmBackupBufferSharingEnabled(context_bin_map) - : CreateContext(enable_htp_weight_sharing); + : CreateContext(enable_htp_weight_sharing, enable_htp_extended_udma_mode); if (status.IsOK()) { LOGS(logger, VERBOSE) << "CreateContext succeed."; diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index 75ab01013bdfd..47321dca33b10 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -161,7 +161,8 @@ class QnnBackendManager : public std::enable_shared_from_this Status SetupBackend(const logging::Logger& logger, bool load_from_cached_context, bool need_load_system_lib, bool share_ep_contexts, bool enable_vtcm_backup_buffer_sharing, - std::unordered_map>>& context_bin_map); + std::unordered_map>>& context_bin_map, + bool enable_htp_extended_udma_mode); Status CreateHtpPowerCfgId(uint32_t deviceId, uint32_t coreId, 
uint32_t& htp_power_config_id); @@ -254,7 +255,7 @@ class QnnBackendManager : public std::enable_shared_from_this Status ReleaseProfilehandle(); - Status CreateContext(bool enable_htp_weight_sharing); + Status CreateContext(bool enable_htp_weight_sharing, bool enable_htp_extended_udma_mode); Status CreateContextVtcmBackupBufferSharingEnabled(std::unordered_map>>& context_bin_map); diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index fcf25a04b656a..625989f0564ad 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -566,6 +566,19 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio } } + static const std::string QNN_HTP_EXTENDED_UDMA_MODE = "extended_udma"; + auto htp_extended_udma_pos = provider_options_map.find(QNN_HTP_EXTENDED_UDMA_MODE); + if (htp_extended_udma_pos != provider_options_map.end()) { + if ("1" == htp_extended_udma_pos->second) { + enable_htp_extended_udma_mode_ = true; + } else if ("0" == htp_extended_udma_pos->second) { + enable_htp_extended_udma_mode_ = false; + } else { + LOGS_DEFAULT(WARNING) << "Invalid enable_htp_extended_udma_mode_ " << htp_extended_udma_pos->second << " only 0 or 1 allowed. Set to 0."; + } + LOGS_DEFAULT(VERBOSE) << "User specified enable_htp_extended_udma_mode_: " << enable_htp_extended_udma_mode_; + } + // Option to skip QNN API interface version check to use other QNN library other than default. 
static const std::string SKIP_QNN_VERSION_CHECK = "skip_qnn_version_check"; auto skip_qnn_version_check = ParseBoolOption(SKIP_QNN_VERSION_CHECK, false, provider_options_map); @@ -948,7 +961,8 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer context_cache_enabled_ && enable_spill_fill_buffer_, share_ep_contexts_, enable_vtcm_backup_buffer_sharing_, - context_bin_map); + context_bin_map, + enable_htp_extended_udma_mode_); context_bin_map.clear(); diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index 6adf613932d66..26e9d3871777f 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -110,6 +110,7 @@ class QNNExecutionProvider : public IExecutionProvider { qnn::ModelSettings model_settings_ = {}; bool dump_json_qnn_graph_ = false; std::string json_qnn_graph_dir_ = ""; + bool enable_htp_extended_udma_mode_ = false; // Whether this is set depends on a session option enabling it and if the RPCMEM dynamic library is available. // This is potentially shared with HtpSharedMemoryAllocator which may be returned by CreatePreferredAllocators(). diff --git a/onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc b/onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc index cecf5575d42a5..52678add71136 100644 --- a/onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc +++ b/onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc @@ -54,6 +54,8 @@ namespace qnnctxgen { "\t [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n" "\t Defaults to '1' (another EP (typically CPU EP) handles the graph I/O quantization and dequantization). 
\n" "\t [QNN only] [enable_htp_spill_fill_buffer]: Enable HTP spill file buffer, used while generating QNN context binary.\n" + "\t [QNN only] [extended_udma]: Enable HTP extended UDMA mode for better performance on supported hardware, options: \n" + "\t '0' (disabled), '1' (enabled). Default: '0'. \n" "\t [Example] -i \"vtcm_mb|8 htp_arch|73\" \n" "\n" "\t-h: help\n"); @@ -165,7 +167,7 @@ static bool ParseSessionConfigs(const std::string& configs_string, ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str); } } else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization" || - key == "enable_htp_spill_fill_buffer") { + key == "enable_htp_spill_fill_buffer" || key == "extended_udma") { std::unordered_set supported_options = {"0", "1"}; if (supported_options.find(value) == supported_options.end()) { std::ostringstream str_stream; @@ -178,7 +180,7 @@ static bool ParseSessionConfigs(const std::string& configs_string, ORT_THROW( "Wrong key type entered. Choose from options: ['backend_type', 'backend_path', 'vtcm_mb', " "'htp_performance_mode', 'htp_graph_finalization_optimization_mode', 'soc_model', 'htp_arch', " - "'enable_htp_fp16_precision', 'offload_graph_io_quantization', 'enable_htp_spill_fill_buffer']"); + "'enable_htp_fp16_precision', 'offload_graph_io_quantization', 'enable_htp_spill_fill_buffer', 'extended_udma']"); } test_config.run_config.provider_options[key] = value; diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 463634b370d4c..601ce73689cda 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -90,6 +90,8 @@ void usage() { "\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n" "\t [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). 
\n" "\t Defaults to '0' (QNN EP handles the graph I/O quantization and dequantization). \n" + "\t [QNN only] [extended_udma]: Enable HTP extended UDMA mode for better performance on supported hardware, options: \n" + "\t '0' (disabled), '1' (enabled). Default: '0'. \n" "\t [Usage]: -e -i '| |' \n\n" "\t [Example] [For QNN EP] -e qnn -i \"profiling_level|detailed backend_type|cpu\" \n\n" "\t [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n" @@ -615,7 +617,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) { std::string str = str_stream.str(); ORT_THROW("Wrong value for htp_arch. select from: " + str); } - } else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization") { + } else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization" || key == "extended_udma") { std::unordered_set supported_options = {"0", "1"}; if (supported_options.find(value) == supported_options.end()) { std::ostringstream str_stream; @@ -629,7 +631,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) { "Wrong key type entered. 
Choose from options: ['backend_type', 'backend_path', " "'profiling_level', 'profiling_file_path', 'rpc_control_latency', 'vtcm_mb', 'htp_performance_mode', " "'qnn_saver_path', 'htp_graph_finalization_optimization_mode', 'op_packages', 'qnn_context_priority', " - "'soc_model', 'htp_arch', 'device_id', 'enable_htp_fp16_precision', 'offload_graph_io_quantization']"); + "'soc_model', 'htp_arch', 'device_id', 'enable_htp_fp16_precision', 'offload_graph_io_quantization', 'extended_udma']"); } qnn_options[key] = value; diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 8960898f036fc..a3edb9c15a927 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -98,6 +98,8 @@ ABSL_FLAG(std::string, i, "", " [QNN only] [enable_htp_spill_fill_buffer]: Enable HTP spill fill buffer, used while generating QNN context binary.\n" " [QNN only] [enable_htp_shared_memory_allocator]: Enable the QNN HTP shared memory allocator and use it for inputs and outputs. Requires libcdsprpc.so/dll to be available.\n" " Defaults to '0' (disabled).\n" + " [QNN only] [extended_udma]: Enable HTP extended UDMA mode for better performance on supported hardware, options: \n" + " '0' (disabled), '1' (enabled). Default: '0'. 
\n" " [Example] [For QNN EP] -e qnn -i \"backend_type|cpu\" \n" "\n" " [TensorRT only] [trt_max_partition_iterations]: Maximum iterations for TensorRT parser to get capability.\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 0f2da07c69d85..cffe9a55138f2 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -325,7 +325,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device "qnn_saver_path", "htp_graph_finalization_optimization_mode", "qnn_context_priority", "htp_arch", "enable_htp_fp16_precision", "offload_graph_io_quantization", "enable_htp_spill_fill_buffer", "enable_htp_shared_memory_allocator", "dump_json_qnn_graph", - "json_qnn_graph_dir"}); + "json_qnn_graph_dir", "extended_udma"}); for (const auto& provider_option : provider_options) { const std::string& key = provider_option.first; const std::string& value = provider_option.second; @@ -389,7 +389,8 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device key == "offload_graph_io_quantization" || key == "enable_htp_spill_fill_buffer" || key == "enable_htp_shared_memory_allocator" || - key == "dump_json_qnn_graph") { + key == "dump_json_qnn_graph" || + key == "extended_udma") { std::set supported_options = {"0", "1"}; if (supported_options.find(value) == supported_options.end()) { std::ostringstream str_stream; diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc index 87ca6e32c82f9..c88927c501159 100644 --- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc @@ -1313,6 +1313,44 @@ TEST_F(QnnHTPBackendTests, DumpJsonQNNGraph) { std::filesystem::remove_all(dump_dir); } +// Test extended UDMA mode on supported hardware (should run successfully) +TEST_F(QnnHTPBackendTests, ExtendedUdmaModeTest) { + std::unique_ptr 
model; + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + std::vector shape = {1, 3, 2}; + + CreateModelInMemory(model, + QDQBuildAdd3Tensors(TestInputDef(shape, false, input_data), + TestInputDef(shape, false, input_data), + TestInputDef(shape, false, input_data)), + "add3.qdq"); + + SessionOptions session_opts; + session_opts.session_logid = "logger0"; + + RunOptions run_opts; + run_opts.run_tag = session_opts.session_logid; + + InferenceSession session_obj{session_opts, GetEnvironment()}; + onnxruntime::ProviderOptions options; + + options["backend_type"] = "htp"; + options["offload_graph_io_quantization"] = "0"; + options["htp_arch"] = "81"; + options["extended_udma"] = "1"; + + auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts); + EXPECT_TRUE(session_obj.RegisterExecutionProvider(std::move(qnn_ep)).IsOK()); + + auto status = session_obj.Load(model->model_data.data(), static_cast(model->model_data.size())); + ASSERT_TRUE(status.IsOK()); + status = session_obj.Initialize(); + ASSERT_TRUE(status.IsOK()); + std::vector fetches; + status = session_obj.Run(run_opts, model->builder.feeds_, model->builder.output_names_, &fetches); + ASSERT_TRUE(status.IsOK()); +} + // Test option for offloading quantization of graph inputs and dequantization of graph outputs to the CPU EP. TEST_F(QnnHTPBackendTests, EPOffloadsGraphIOQuantDequant) { // Returns a function that checks that the Q/DQ ops at the graph IO boundary are offloaded to CPU