diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index c583598bbcc52..f62ee7ab78bf2 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -2481,17 +2481,16 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
   std::vector<size_t> nodes_vector(number_of_ort_nodes);
   std::iota(std::begin(nodes_vector), std::end(nodes_vector), 0);
 
-  std::set<std::string> exclude_ops_set;
+  std::set<std::string> exclude_ops_set;  // currently no ops are excluded
 
   /*
-   * There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) in TRT 10.
-   * TRT EP automatically excludes DDS ops from running on TRT.
+   * There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) when running TRT EP with TRT 10.
+   * DDS ops need special handling here.
    */
-  if (trt_version_ >= 100000 && trt_version_ < 110000) {
-    exclude_ops_set.insert("NonMaxSuppression");
-    exclude_ops_set.insert("NonZero");
-    exclude_ops_set.insert("RoiAlign");
-    LOGS_DEFAULT(VERBOSE) << "There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) in TRT 10. TRT EP automatically excludes DDS ops from running on TRT, if applicable";
+  if (trt_version_ >= 100000) {
+    dds_op_set_.insert("NonMaxSuppression");
+    dds_op_set_.insert("NonZero");
+    dds_op_set_.insert("RoiAlign");
   }
 
   SubGraphCollection_t parser_nodes_vector, supported_nodes_vector;
@@ -2649,6 +2648,10 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
     }
     LOGS_DEFAULT(INFO) << "[TensorRT EP] Whole graph will run on TensorRT execution provider";
 
+#if NV_TENSORRT_MAJOR >= 10
+    // TRT EP will take appropriate actions later to prevent performance degradation if the graph has DDS ops that will run on TRT 10.
+    is_dds_op_in_graph_ = IsDDSOpInSubGraph(graph, result, dds_op_set_);
+#endif
     // The context map is only used during EP compile time, release it to save memory space.
     subgraph_context_map_.clear();
     return result;
@@ -2665,6 +2668,11 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
     }
   }
 
+#if NV_TENSORRT_MAJOR >= 10
+  // TRT EP will take appropriate actions later to prevent performance degradation if the graph has DDS ops that will run on TRT 10.
+  is_dds_op_in_graph_ = IsDDSOpInSubGraph(graph, result, dds_op_set_);
+#endif
+
   const size_t number_of_subgraphs = supported_nodes_vector.size();
   if (number_of_trt_nodes == 0) {
     LOGS_DEFAULT(WARNING) << "[TensorRT EP] No graph will run on TensorRT execution provider";
@@ -2765,6 +2773,17 @@ common::Status TensorrtExecutionProvider::RefitEngine(std::string onnx_model_fil
 
 common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
                                                   std::vector<NodeComputeInfo>& node_compute_funcs) {
+#if NV_TENSORRT_MAJOR >= 10
+  // There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) when running TRT EP with TRT 10.
+  // The issue arises because when cudaStreamSynchronize is called after inference, GPU memory is released back to the OS.
+  // As a result, for the next inference run, TRT reallocates GPU memory from the OS, introducing overhead and leading to performance degradation.
+  // The solution is to increase the memory pool threshold, allowing TRT to retain the allocated memory and reduce this overhead.
+  if (is_dds_op_in_graph_) {
+    trt_gpu_allocator_ = std::make_unique<PoolAllocator>();
+    runtime_->setGpuAllocator(trt_gpu_allocator_.get());
+  }
+#endif
+
   for (auto& fused_node_graph : fused_nodes_and_graphs) {
     const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph;
     const Node& fused_node = fused_node_graph.fused_node;
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
index d3e0b0fba8891..5bf1bc35e9d92 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -114,6 +114,50 @@ template <typename T>
 using unique_pointer = std::unique_ptr<T, TensorrtInferDeleter>;
 };  // namespace tensorrt_ptr
 
+#if NV_TENSORRT_MAJOR >= 10
+/*
+ * Application-implemented class for controlling asynchronous (stream ordered) memory allocation on the GPU.
+ *
+ */
+class PoolAllocator : public nvinfer1::IGpuAsyncAllocator {
+ public:
+  PoolAllocator() {
+    cudaMemPoolProps poolProps{};
+    poolProps.allocType = ::cudaMemAllocationTypePinned;
+    poolProps.handleTypes = ::cudaMemHandleTypeNone;
+    poolProps.location.type = ::cudaMemLocationTypeDevice;
+    poolProps.location.id = 0;
+    cudaMemPoolCreate(&mPool, &poolProps);
+    auto maxThreshold = std::numeric_limits<uint64_t>::max();
+    // cudaMemPoolAttrReleaseThreshold:
+    // Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS.
+    // When more than the release threshold bytes of memory are held by the memory pool, the allocator
+    // will try to release memory back to the OS on the next call to stream, event or context synchronize.
+    cudaMemPoolSetAttribute(mPool, cudaMemPoolAttrReleaseThreshold, &maxThreshold);
+  }
+
+  void* allocateAsync(uint64_t const size, uint64_t const alignment, nvinfer1::AllocatorFlags const flags,
+                      cudaStream_t stream) noexcept override {
+    void* memory{nullptr};
+    cudaMallocFromPoolAsync(&memory, size, mPool, stream);
+    return memory;
+  }
+  bool deallocateAsync(void* const memory, cudaStream_t stream) noexcept override {
+    cudaFreeAsync(memory, stream);
+    return true;
+  }
+
+  ~PoolAllocator() {
+    if (mPool) {
+      cudaMemPoolDestroy(mPool);
+    }
+  }
+
+ private:
+  cudaMemPool_t mPool{nullptr};
+};
+#endif
+
 //
 // Class to allocate memory for outputs with data-dependent shapes. The sizes of those are unknown so pre-allocation is
 // not possible.
@@ -312,6 +356,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   std::string tactic_sources_;
   std::string global_cache_path_, cache_path_, engine_decryption_lib_path_;
   std::unique_ptr<nvinfer1::IRuntime> runtime_ = nullptr;
+  std::unique_ptr<nvinfer1::IGpuAllocator> trt_gpu_allocator_ = nullptr;
   std::mutex tensorrt_mu_;
   int device_id_;
   std::string compute_capability_;
@@ -351,6 +396,9 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   std::unordered_set<std::string> control_flow_op_set_ = {"If", "Loop", "Scan"};
   mutable std::unordered_map<std::string, std::unique_ptr<SubGraphContext>> subgraph_context_map_;
 
+  mutable std::unordered_set<std::string> dds_op_set_;
+  mutable bool is_dds_op_in_graph_ = false;
+
   mutable std::unique_ptr<nvinfer1::IBuilder> builder_;
 
   // Following maps that hold TRT objects will be accessible by different threads if ORT is using multithreading.
@@ -590,5 +638,12 @@ class TensorrtExecutionProvider : public IExecutionProvider {
    * This function only creates the instance at the first time it's being called."
    */
   nvinfer1::IBuilder* GetBuilder(TensorrtLogger& trt_logger) const;
+
+  /**
+   * Check if a DDS op is in the ComputeCapability/subgraph.
+   */
+  bool IsDDSOpInSubGraph(const GraphViewer& graph,
+                         std::vector<std::unique_ptr<ComputeCapability>>& compute_capabilities,
+                         std::unordered_set<std::string>& dds_op_set) const;
 };
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc
index 92fa101118506..104594ce4d424 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc
@@ -258,4 +258,22 @@ void TensorrtExecutionProvider::SetAllGraphInputs(Graph& graph) const {
 
   graph.SetInputs(graph_inputs_including_initializers);
 }
+
+// Check if a DDS op is in the ComputeCapability/subgraph.
+bool TensorrtExecutionProvider::IsDDSOpInSubGraph(const GraphViewer& graph,
+                                                  std::vector<std::unique_ptr<ComputeCapability>>& compute_capabilities,
+                                                  std::unordered_set<std::string>& dds_op_set) const {
+  auto is_dds_op = [&](const auto& node) {
+    if (dds_op_set.find(node->OpType()) != dds_op_set.end()) return true;
+    return false;
+  };
+
+  for (auto& compute_capability : compute_capabilities) {
+    auto& indexed_sub_graph = compute_capability->SubGraph();
+    for (auto i : indexed_sub_graph->Nodes()) {
+      if (is_dds_op(graph.GetNode(i))) return true;
+    }
+  }
+  return false;
+}
 }  // namespace onnxruntime
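
Illustrative addendum (not part of the patch): the Compile() comment above relies on the behavior of cudaMemPoolAttrReleaseThreshold. The standalone sketch below, with an arbitrary file name and allocation size, shows that effect in isolation: with the default threshold of 0 the pool hands its reservation back to the OS at the next synchronization, whereas with the threshold raised to the maximum the reservation is kept and can be reused by the next run without a fresh OS allocation.

// pool_threshold_demo.cu -- illustrative sketch only, not part of the patch.
#include <cuda_runtime_api.h>
#include <cstdint>
#include <cstdio>
#include <limits>

static uint64_t ReservedBytes(cudaMemPool_t pool) {
  uint64_t reserved = 0;
  cudaMemPoolGetAttribute(pool, cudaMemPoolAttrReservedMemCurrent, &reserved);
  return reserved;
}

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Create an explicit pool with the same properties PoolAllocator uses.
  cudaMemPoolProps props{};
  props.allocType = cudaMemAllocationTypePinned;
  props.handleTypes = cudaMemHandleTypeNone;
  props.location.type = cudaMemLocationTypeDevice;
  props.location.id = 0;
  cudaMemPool_t pool;
  cudaMemPoolCreate(&pool, &props);

  // Raise the release threshold so the pool keeps its reservation across synchronizations.
  uint64_t threshold = std::numeric_limits<uint64_t>::max();
  cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);

  void* p = nullptr;
  cudaMallocFromPoolAsync(&p, 64 << 20, pool, stream);  // 64 MiB, arbitrary size
  cudaFreeAsync(p, stream);
  cudaStreamSynchronize(stream);

  // With the threshold at max the reservation is still held here; with the default
  // threshold of 0 the synchronize above would have returned it to the OS, forcing
  // a fresh OS allocation on the next inference run.
  std::printf("reserved after sync: %llu bytes\n", (unsigned long long)ReservedBytes(pool));

  cudaMemPoolDestroy(pool);
  cudaStreamDestroy(stream);
  return 0;
}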
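
A second illustrative sketch (also not part of the patch) of where setGpuAllocator() fits: the allocator is registered on the IRuntime before any engine is deserialized so that memory TRT acquires for that runtime's engines goes through it, which is why the change above installs the allocator at the top of Compile(). DemoAllocator, StderrLogger and the "model.engine" path are placeholders invented to keep the sketch self-contained; in the EP the pool-backed PoolAllocator from the header above plays this role.

// setgpuallocator_demo.cc -- illustrative sketch only, not part of the patch.
#include <NvInferRuntime.h>
#include <cuda_runtime_api.h>
#include <cstdio>
#include <fstream>
#include <iterator>
#include <memory>
#include <vector>

// Minimal stand-in allocator (default memory pool); the EP would use PoolAllocator instead.
class DemoAllocator : public nvinfer1::IGpuAsyncAllocator {
 public:
  void* allocateAsync(uint64_t const size, uint64_t const /*alignment*/, nvinfer1::AllocatorFlags const /*flags*/,
                      cudaStream_t stream) noexcept override {
    void* memory{nullptr};
    cudaMallocAsync(&memory, size, stream);
    return memory;
  }
  bool deallocateAsync(void* const memory, cudaStream_t stream) noexcept override {
    cudaFreeAsync(memory, stream);
    return true;
  }
};

class StderrLogger : public nvinfer1::ILogger {
  void log(Severity severity, const char* msg) noexcept override {
    if (severity <= Severity::kWARNING) std::fprintf(stderr, "[TRT] %s\n", msg);
  }
};

int main() {
  StderrLogger logger;
  DemoAllocator allocator;

  std::unique_ptr<nvinfer1::IRuntime> runtime{nvinfer1::createInferRuntime(logger)};
  // Register the allocator before deserializing, mirroring the Compile() change:
  // memory TRT acquires for this runtime's engines then goes through the allocator.
  runtime->setGpuAllocator(&allocator);

  std::ifstream file("model.engine", std::ios::binary);  // placeholder engine path
  std::vector<char> blob((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
  std::unique_ptr<nvinfer1::ICudaEngine> engine{runtime->deserializeCudaEngine(blob.data(), blob.size())};
  return engine ? 0 : 1;
}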