[TensorRT] Fix perf issue for DDS nodes run by TRT 10 #23424

Open · wants to merge 4 commits into main
35 changes: 27 additions & 8 deletions onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -2481,17 +2481,16 @@
std::vector<size_t> nodes_vector(number_of_ort_nodes);
std::iota(std::begin(nodes_vector), std::end(nodes_vector), 0);

-std::set<std::string> exclude_ops_set;
+std::set<std::string> exclude_ops_set;  // excluding ops is currently not supported


/*
- * There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) in TRT 10.
- * TRT EP automatically excludes DDS ops from running on TRT.
+ * There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) when running TRT EP with TRT 10.
+ * DDS op needs special handling here.
 */
-if (trt_version_ >= 100000 && trt_version_ < 110000) {
-  exclude_ops_set.insert("NonMaxSuppression");
-  exclude_ops_set.insert("NonZero");
-  exclude_ops_set.insert("RoiAlign");
-  LOGS_DEFAULT(VERBOSE) << "There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) in TRT 10. TRT EP automatically excludes DDS ops from running on TRT, if applicable";
+if (trt_version_ >= 100000) {
+  dds_op_set_.insert("NonMaxSuppression");
+  dds_op_set_.insert("NonZero");
+  dds_op_set_.insert("RoiAlign");
}

SubGraphCollection_t parser_nodes_vector, supported_nodes_vector;
@@ -2649,6 +2648,10 @@
}
LOGS_DEFAULT(INFO) << "[TensorRT EP] Whole graph will run on TensorRT execution provider";

+#if NV_TENSORRT_MAJOR >= 10
+  // TRT EP will take appropriate actions later to prevent performance degradation if the graph has DDS ops that will be run by TRT 10.
+  is_dds_op_in_graph_ = IsDDSOpInSubGraph(graph, result, dds_op_set_);
+#endif
// The context map is only used during EP compile time, release it to save memory space.
subgraph_context_map_.clear();
return result;
@@ -2665,6 +2668,11 @@
}
}

+#if NV_TENSORRT_MAJOR >= 10
+  // TRT EP will take appropriate actions later to prevent performance degradation if the graph has DDS ops that will be run by TRT 10.
+  is_dds_op_in_graph_ = IsDDSOpInSubGraph(graph, result, dds_op_set_);
+#endif

const size_t number_of_subgraphs = supported_nodes_vector.size();
if (number_of_trt_nodes == 0) {
LOGS_DEFAULT(WARNING) << "[TensorRT EP] No graph will run on TensorRT execution provider";
@@ -2765,6 +2773,17 @@

common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
std::vector<NodeComputeInfo>& node_compute_funcs) {
+#if NV_TENSORRT_MAJOR >= 10
+  // There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) when running TRT EP with TRT 10.
+  // The issue arises because when cudaStreamSynchronize is called after inference, GPU memory is released back to the OS.
+  // As a result, for the next inference run, TRT reallocates GPU memory from the OS, introducing overhead and leading to performance degradation.
+  // The solution is to increase the memory pool threshold, allowing TRT to retain the allocated memory and reduce this overhead.
+  if (is_dds_op_in_graph_) {
+    trt_gpu_allocator_ = std::make_unique<onnxruntime::PoolAllocator>();
+    runtime_->setGpuAllocator(trt_gpu_allocator_.get());
+  }
+#endif

for (auto& fused_node_graph : fused_nodes_and_graphs) {
const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph;
const Node& fused_node = fused_node_graph.fused_node;
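The comment added in Compile() above describes the root cause: with a default release threshold, each cudaStreamSynchronize lets the driver return the pool's unused reserved memory to the OS, so every new inference run pays the OS allocation cost again. A minimal standalone CUDA sketch of that mechanism (not part of this PR; device 0, the stream, and the 64 MiB size are illustrative):

#include <cuda_runtime.h>
#include <cstdint>
#include <limits>

int main() {
  // Use the device's default stream-ordered memory pool (device 0 here for illustration).
  cudaMemPool_t pool{};
  cudaDeviceGetDefaultMemPool(&pool, 0);

  // The default threshold is 0: on every stream/event/context synchronize the pool
  // may return its unused reserved memory to the OS, so the next cudaMallocAsync
  // pays the allocation cost again. Raising it to UINT64_MAX keeps the memory cached.
  std::uint64_t threshold = std::numeric_limits<std::uint64_t>::max();
  cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);

  cudaStream_t stream{};
  cudaStreamCreate(&stream);

  void* buf = nullptr;
  cudaMallocAsync(&buf, 64 << 20, stream);  // 64 MiB drawn from the pool
  cudaFreeAsync(buf, stream);               // returned to the pool, not freed to the OS
  cudaStreamSynchronize(stream);            // with the raised threshold, the reservation is kept

  cudaStreamDestroy(stream);
  return 0;
}

The PoolAllocator added in the header below applies the same release-threshold setting to a dedicated pool that TensorRT allocates from via the IGpuAsyncAllocator interface.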
55 changes: 55 additions & 0 deletions onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -114,6 +114,50 @@
using unique_pointer = std::unique_ptr<T, TensorrtInferDeleter>;
}; // namespace tensorrt_ptr

+#if NV_TENSORRT_MAJOR >= 10
+/*
+ * Application-implemented class for controlling asynchronous (stream ordered) memory allocation on the GPU.
+ */
+class PoolAllocator : public nvinfer1::IGpuAsyncAllocator {
+ public:
+  PoolAllocator() {
+    cudaMemPoolProps poolProps{};
+    poolProps.allocType = ::cudaMemAllocationTypePinned;
+    poolProps.handleTypes = ::cudaMemHandleTypeNone;
+    poolProps.location.type = ::cudaMemLocationTypeDevice;
+    poolProps.location.id = 0;
+    cudaMemPoolCreate(&mPool, &poolProps);
+    auto maxThreshold = std::numeric_limits<std::uint64_t>::max();
+
+    // cudaMemPoolAttrReleaseThreshold:
+    // Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS.
+    // When more than the release threshold bytes of memory are held by the memory pool, the allocator
+    // will try to release memory back to the OS on the next call to stream, event or context synchronize.
+    cudaMemPoolSetAttribute(mPool, cudaMemPoolAttrReleaseThreshold, &maxThreshold);
+  }
+
+  void* allocateAsync(uint64_t const size, uint64_t const alignment, nvinfer1::AllocatorFlags const flags,
+                      cudaStream_t stream) noexcept override {
+    void* memory{nullptr};
+    cudaMallocFromPoolAsync(&memory, size, mPool, stream);
+    return memory;
+  }
+
+  bool deallocateAsync(void* const memory, cudaStream_t stream) noexcept override {
+    cudaFreeAsync(memory, stream);
+    return true;
+  }
+
+  ~PoolAllocator() {
+    if (mPool) {
+      cudaMemPoolDestroy(mPool);
+    }
+  }
+
+ private:
+  cudaMemPool_t mPool{nullptr};
+};
+#endif

//
// Class to allocate memory for outputs with data-dependent shapes. The sizes of those are unknown so pre-allocation is
// not possible.
@@ -312,6 +356,7 @@
std::string tactic_sources_;
std::string global_cache_path_, cache_path_, engine_decryption_lib_path_;
std::unique_ptr<nvinfer1::IRuntime> runtime_ = nullptr;
+  std::unique_ptr<PoolAllocator> trt_gpu_allocator_ = nullptr;
std::mutex tensorrt_mu_;
int device_id_;
std::string compute_capability_;
@@ -351,6 +396,9 @@
std::unordered_set<std::string> control_flow_op_set_ = {"If", "Loop", "Scan"};
mutable std::unordered_map<std::string, std::unique_ptr<SubGraphContext>> subgraph_context_map_;

+  mutable std::unordered_set<std::string> dds_op_set_;
+  mutable bool is_dds_op_in_graph_ = false;

mutable std::unique_ptr<nvinfer1::IBuilder> builder_;

// Following maps that hold TRT objects will be accessible by different threads if ORT is using multithreading.
@@ -590,5 +638,12 @@
* This function only creates the instance at the first time it's being called."
*/
nvinfer1::IBuilder* GetBuilder(TensorrtLogger& trt_logger) const;

+  /**
+   * Check if DDS op is in the ComputeCapability/subgraph.
+   */
+  bool IsDDSOpInSubGraph(const GraphViewer& graph,
+                         std::vector<std::unique_ptr<ComputeCapability>>& compute_capabilities,
+                         std::unordered_set<std::string>& dds_op_set) const;
};
} // namespace onnxruntime
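As a usage note, the only TensorRT-facing call involved is nvinfer1::IRuntime::setGpuAllocator, which the PR invokes in Compile() with the PoolAllocator above. A hedged wiring sketch follows; the helper name AttachPoolAllocator is hypothetical, and the allocator is assumed to stay alive while the runtime and its engines use it, which is why the EP keeps it in the trt_gpu_allocator_ member rather than a local:

#include <NvInfer.h>
#include <memory>
// Assumes the PoolAllocator definition from the header above is visible.

// Sketch only: mirrors the Compile() change shown earlier.
void AttachPoolAllocator(nvinfer1::IRuntime& runtime,
                         std::unique_ptr<onnxruntime::PoolAllocator>& allocator_slot) {
  // IGpuAsyncAllocator derives from IGpuAllocator, so the pointer can be passed directly.
  allocator_slot = std::make_unique<onnxruntime::PoolAllocator>();
  runtime.setGpuAllocator(allocator_slot.get());
}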
onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc
@@ -258,4 +258,22 @@

graph.SetInputs(graph_inputs_including_initializers);
}

+// Check if DDS op is in the ComputeCapability/subgraph.
+bool TensorrtExecutionProvider::IsDDSOpInSubGraph(const GraphViewer& graph,
+                                                  std::vector<std::unique_ptr<ComputeCapability>>& compute_capabilities,
+                                                  std::unordered_set<std::string>& dds_op_set) const {
+  auto is_dds_op = [&](const auto& node) {
+    return dds_op_set.find(node->OpType()) != dds_op_set.end();
+  };
+
+  for (auto& compute_capability : compute_capabilities) {
+    auto& indexed_sub_graph = compute_capability->SubGraph();
+    for (auto i : indexed_sub_graph->Nodes()) {
+      if (is_dds_op(graph.GetNode(i))) return true;
+    }
+  }
+  return false;
+}
} // namespace onnxruntime