[TensorRT] Fix perf issue for DDS nodes run by TRT 10 #23424

Open · wants to merge 4 commits into main
35 changes: 27 additions & 8 deletions onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -2481,17 +2481,16 @@
std::vector<size_t> nodes_vector(number_of_ort_nodes);
std::iota(std::begin(nodes_vector), std::end(nodes_vector), 0);

-std::set<std::string> exclude_ops_set;
+std::set<std::string> exclude_ops_set;  // excluding ops is currently not supported


/*
- * There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) in TRT 10.
- * TRT EP automatically excludes DDS ops from running on TRT.
+ * There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) when running TRT EP with TRT 10.
+ * DDS op needs special handling here.
 */
-if (trt_version_ >= 100000 && trt_version_ < 110000) {
-  exclude_ops_set.insert("NonMaxSuppression");
-  exclude_ops_set.insert("NonZero");
-  exclude_ops_set.insert("RoiAlign");
-  LOGS_DEFAULT(VERBOSE) << "There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) in TRT 10. TRT EP automatically excludes DDS ops from running on TRT, if applicable";
+if (trt_version_ >= 100000) {
+  dds_op_set_.insert("NonMaxSuppression");
+  dds_op_set_.insert("NonZero");
+  dds_op_set_.insert("RoiAlign");
}

SubGraphCollection_t parser_nodes_vector, supported_nodes_vector;
@@ -2649,6 +2648,10 @@
}
LOGS_DEFAULT(INFO) << "[TensorRT EP] Whole graph will run on TensorRT execution provider";

+#if NV_TENSORRT_MAJOR >= 10
+  // TRT EP will take appropriate actions later to prevent performance degradation if the graph has DDS ops that will be run by TRT 10.
+  is_dds_op_in_graph_ = IsDDSOpInSubGraph(graph, result, dds_op_set_);
+#endif
// The context map is only used during EP compile time, release it to save memory space.
subgraph_context_map_.clear();
return result;
@@ -2665,6 +2668,11 @@
}
}

+#if NV_TENSORRT_MAJOR >= 10
+  // TRT EP will take appropriate actions later to prevent performance degradation if the graph has DDS ops that will be run by TRT 10.
+  is_dds_op_in_graph_ = IsDDSOpInSubGraph(graph, result, dds_op_set_);
+#endif

const size_t number_of_subgraphs = supported_nodes_vector.size();
if (number_of_trt_nodes == 0) {
LOGS_DEFAULT(WARNING) << "[TensorRT EP] No graph will run on TensorRT execution provider";
@@ -2765,6 +2773,17 @@

common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
std::vector<NodeComputeInfo>& node_compute_funcs) {
+#if NV_TENSORRT_MAJOR >= 10
+  // There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) when running TRT EP with TRT 10.
+  // The issue arises because when cudaStreamSynchronize is called after inference, GPU memory is released back to the OS.
+  // As a result, for the next inference run, TRT reallocates GPU memory from the OS, introducing overhead and leading to performance degradation.
+  // The solution is to increase the memory pool threshold, allowing TRT to retain the allocated memory and reduce this overhead.
+  if (is_dds_op_in_graph_) {
+    trt_gpu_allocator_ = std::make_unique<onnxruntime::PoolAllocator>();
+    runtime_->setGpuAllocator(trt_gpu_allocator_.get());
+  }
+#endif

for (auto& fused_node_graph : fused_nodes_and_graphs) {
const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph;
const Node& fused_node = fused_node_graph.fused_node;
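The comment added in Compile() above describes the root cause: with a default release threshold, each cudaStreamSynchronize lets the driver return the pool's unused reserved memory to the OS, so every new inference run pays the OS allocation cost again. A minimal standalone CUDA sketch of that mechanism (not part of this PR; device 0, the stream, and the 64 MiB size are illustrative):

#include <cuda_runtime.h>
#include <cstdint>
#include <limits>

int main() {
  // Use the device's default stream-ordered memory pool (device 0 here for illustration).
  cudaMemPool_t pool{};
  cudaDeviceGetDefaultMemPool(&pool, 0);

  // The default threshold is 0: on every stream/event/context synchronize the pool
  // may return its unused reserved memory to the OS, so the next cudaMallocAsync
  // pays the allocation cost again. Raising it to UINT64_MAX keeps the memory cached.
  std::uint64_t threshold = std::numeric_limits<std::uint64_t>::max();
  cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);

  cudaStream_t stream{};
  cudaStreamCreate(&stream);

  void* buf = nullptr;
  cudaMallocAsync(&buf, 64 << 20, stream);  // 64 MiB drawn from the pool
  cudaFreeAsync(buf, stream);               // returned to the pool, not freed to the OS
  cudaStreamSynchronize(stream);            // with the raised threshold, the reservation is kept

  cudaStreamDestroy(stream);
  return 0;
}

The PoolAllocator added in the header below applies the same release-threshold setting to a dedicated pool that TensorRT allocates from via the IGpuAsyncAllocator interface.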
55 changes: 55 additions & 0 deletions onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -114,6 +114,50 @@
using unique_pointer = std::unique_ptr<T, TensorrtInferDeleter>;
}; // namespace tensorrt_ptr

+#if NV_TENSORRT_MAJOR >= 10
+/*
+ * Application-implemented class for controlling asynchronous (stream ordered) memory allocation on the GPU.
+ */
+class PoolAllocator : public nvinfer1::IGpuAsyncAllocator {
+ public:
+  PoolAllocator() {
+    cudaMemPoolProps poolProps{};
+    poolProps.allocType = ::cudaMemAllocationTypePinned;
+    poolProps.handleTypes = ::cudaMemHandleTypeNone;
+    poolProps.location.type = ::cudaMemLocationTypeDevice;
+    poolProps.location.id = 0;
+    cudaMemPoolCreate(&mPool, &poolProps);
+    auto maxThreshold = std::numeric_limits<std::uint64_t>::max();
+
+    // cudaMemPoolAttrReleaseThreshold:
+    // Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS.
+    // When more than the release threshold bytes of memory are held by the memory pool, the allocator
+    // will try to release memory back to the OS on the next call to stream, event or context synchronize.
+    cudaMemPoolSetAttribute(mPool, cudaMemPoolAttrReleaseThreshold, &maxThreshold);
+  }
+
+  void* allocateAsync(uint64_t const size, uint64_t const alignment, nvinfer1::AllocatorFlags const flags,
+                      cudaStream_t stream) noexcept override {
+    void* memory{nullptr};
+    cudaMallocFromPoolAsync(&memory, size, mPool, stream);
+    return memory;
+  }
+
+  bool deallocateAsync(void* const memory, cudaStream_t stream) noexcept override {
+    cudaFreeAsync(memory, stream);
+    return true;
+  }
+
+  ~PoolAllocator() {
+    if (mPool) {
+      cudaMemPoolDestroy(mPool);
+    }
+  }
+
+ private:
+  cudaMemPool_t mPool{nullptr};
+};
+#endif

//
// Class to allocate memory for outputs with data-dependent shapes. The sizes of those are unknown so pre-allocation is
// not possible.
@@ -312,6 +356,7 @@
std::string tactic_sources_;
std::string global_cache_path_, cache_path_, engine_decryption_lib_path_;
std::unique_ptr<nvinfer1::IRuntime> runtime_ = nullptr;
+  std::unique_ptr<PoolAllocator> trt_gpu_allocator_ = nullptr;
std::mutex tensorrt_mu_;
int device_id_;
std::string compute_capability_;
@@ -351,6 +396,9 @@
std::unordered_set<std::string> control_flow_op_set_ = {"If", "Loop", "Scan"};
mutable std::unordered_map<std::string, std::unique_ptr<SubGraphContext>> subgraph_context_map_;

+  mutable std::unordered_set<std::string> dds_op_set_;
+  mutable bool is_dds_op_in_graph_ = false;

mutable std::unique_ptr<nvinfer1::IBuilder> builder_;

// Following maps that hold TRT objects will be accessible by different threads if ORT is using multithreading.
@@ -590,5 +638,12 @@
* This function only creates the instance at the first time it's being called."
*/
nvinfer1::IBuilder* GetBuilder(TensorrtLogger& trt_logger) const;

+  /**
+   * Check if DDS op is in the ComputeCapability/subgraph.
+   */
+  bool IsDDSOpInSubGraph(const GraphViewer& graph,
+                         std::vector<std::unique_ptr<ComputeCapability>>& compute_capabilities,
+                         std::unordered_set<std::string>& dds_op_set) const;
};
} // namespace onnxruntime
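As a usage note, the only TensorRT-facing call involved is nvinfer1::IRuntime::setGpuAllocator, which the PR invokes in Compile() with the PoolAllocator above. A hedged wiring sketch follows; the helper name AttachPoolAllocator is hypothetical, and the allocator is assumed to stay alive while the runtime and its engines use it, which is why the EP keeps it in the trt_gpu_allocator_ member rather than a local:

#include <NvInfer.h>
#include <memory>
// Assumes the PoolAllocator definition from the header above is visible.

// Sketch only: mirrors the Compile() change shown earlier.
void AttachPoolAllocator(nvinfer1::IRuntime& runtime,
                         std::unique_ptr<onnxruntime::PoolAllocator>& allocator_slot) {
  // IGpuAsyncAllocator derives from IGpuAllocator, so the pointer can be passed directly.
  allocator_slot = std::make_unique<onnxruntime::PoolAllocator>();
  runtime.setGpuAllocator(allocator_slot.get());
}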
onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc
@@ -258,4 +258,22 @@

graph.SetInputs(graph_inputs_including_initializers);
}

+// Check if DDS op is in the ComputeCapability/subgraph.
+bool TensorrtExecutionProvider::IsDDSOpInSubGraph(const GraphViewer& graph,
+                                                  std::vector<std::unique_ptr<ComputeCapability>>& compute_capabilities,
+                                                  std::unordered_set<std::string>& dds_op_set) const {
+  auto is_dds_op = [&](const auto& node) {
+    return dds_op_set.find(node->OpType()) != dds_op_set.end();
+  };
+
+  for (auto& compute_capability : compute_capabilities) {
+    auto& indexed_sub_graph = compute_capability->SubGraph();
+    for (auto i : indexed_sub_graph->Nodes()) {
+      if (is_dds_op(graph.GetNode(i))) return true;
+    }
+  }
+  return false;
+}
} // namespace onnxruntime