Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(tensorrt yolox): inference and publish mask image from yolox model with semantic segmentation header #5553

Merged
Show file tree
Hide file tree
Changes from 36 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
767a936
add segmentation model
badai-nguyen Aug 24, 2023
8356196
fix: publish mask
badai-nguyen Aug 24, 2023
297528e
feat: publish colorized mask
badai-nguyen Aug 25, 2023
6dbbe77
fix: resize yolox mask
badai-nguyen Aug 29, 2023
f648980
fix: add memory allocate operations
manato Oct 6, 2023
3ab80e9
refactor: remove underscore for a local variable
manato Oct 6, 2023
b1f901f
chore: add condition to check the number of subscriber for newly adde…
manato Oct 6, 2023
57f7daf
chore: pre-commit
badai-nguyen Nov 10, 2023
b377d4b
fix: add roi overlapping segment
badai-nguyen Nov 20, 2023
f3d1bf6
fix: roi overlay segment
badai-nguyen Nov 20, 2023
295fae6
chore: refactor
badai-nguyen Nov 20, 2023
aafb438
docs: update readme
badai-nguyen Nov 20, 2023
c4cc56c
fix: update model name
badai-nguyen May 22, 2024
1266414
fix: add utils into tensorrt_yolox
badai-nguyen May 22, 2024
3488abb
fix: launch file
badai-nguyen May 22, 2024
b996b69
chore: remove unnecessary depend
badai-nguyen May 23, 2024
ffe2f7a
chore: fix yaml file
badai-nguyen May 23, 2024
126262b
chore: remove duplicated param in launch
badai-nguyen May 23, 2024
84da29c
fix: semantic class
badai-nguyen May 24, 2024
72de1e0
docs: update readme
badai-nguyen May 27, 2024
69cc160
chore: update default param
badai-nguyen May 27, 2024
d8e6ae7
fix: add processing time topic
badai-nguyen May 27, 2024
ea0d115
chore: typo
badai-nguyen May 29, 2024
6131b9c
Merge branch 'main' into feat/tensorrt_yolox/add_semantic_segmentation
badai-nguyen May 29, 2024
42f2101
style(pre-commit): autofix
pre-commit-ci[bot] May 29, 2024
d2b806d
Merge branch 'main' into feat/tensorrt_yolox/add_semantic_segmentation
badai-nguyen Jun 6, 2024
c5d0e1e
chore: fix cspell error
badai-nguyen Jun 7, 2024
5529227
fix: yolox default param
badai-nguyen Jun 11, 2024
90bdb42
chore: rename debug topics
badai-nguyen Jun 11, 2024
fa184f7
Merge branch 'main' into feat/tensorrt_yolox/add_semantic_segmentation
badai-nguyen Jun 12, 2024
3c470a8
Merge branch 'main' into feat/tensorrt_yolox/add_semantic_segmentation
badai-nguyen Jun 21, 2024
d70ca58
chore: rename debug topics
badai-nguyen Jun 21, 2024
1a2e2f7
docs: update model description
badai-nguyen Jun 21, 2024
c1ffa4a
style(pre-commit): autofix
pre-commit-ci[bot] Jun 21, 2024
ff05fb6
fix: launch
badai-nguyen Jun 21, 2024
2d32fcb
refactor: unpublish mask for single task
badai-nguyen Jun 21, 2024
ff14e52
Update perception/tensorrt_yolox/src/tensorrt_yolox.cpp
badai-nguyen Jun 23, 2024
9df6103
Update perception/tensorrt_yolox/src/tensorrt_yolox.cpp
badai-nguyen Jun 23, 2024
f305df2
Update perception/tensorrt_yolox/src/tensorrt_yolox.cpp
badai-nguyen Jun 23, 2024
174010e
style(pre-commit): autofix
pre-commit-ci[bot] Jun 23, 2024
d800691
docs: update reamde
badai-nguyen Jun 23, 2024
8832a3a
style(pre-commit): autofix
pre-commit-ci[bot] Jun 23, 2024
a15ac04
fix: skip mask size as yolox output
badai-nguyen Jun 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
#ifndef TENSORRT_COMMON__TENSORRT_COMMON_HPP_
#define TENSORRT_COMMON__TENSORRT_COMMON_HPP_

#ifndef YOLOX_STANDALONE
#include <rclcpp/rclcpp.hpp>
#endif

#include <NvInfer.h>
#include <NvOnnxParser.h>
Expand Down Expand Up @@ -86,6 +88,7 @@ struct BuildConfig
profile_per_layer(profile_per_layer),
clip_value(clip_value)
{
#ifndef YOLOX_STANDALONE
if (
std::find(valid_calib_type.begin(), valid_calib_type.end(), calib_type_str) ==
valid_calib_type.end()) {
Expand All @@ -95,6 +98,7 @@ struct BuildConfig
<< "Default calibration type will be used: MinMax" << std::endl;
std::cerr << message.str();
}
#endif
}
};

Expand Down
83 changes: 60 additions & 23 deletions perception/tensorrt_yolox/README.md

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions perception/tensorrt_yolox/config/yolox_s_plus_opt.param.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,29 @@
# cspell: ignore semseg
/**:
ros__parameters:
# refine segmentation mask by overlay roi class
# disable when sematic segmentation accuracy is good enough
is_roi_overlap_segment: true

# minimum existence_probability of detected roi considered to replace segmentation
overlap_roi_score_threshold: 0.3

# publish color mask for result visualization
is_publish_color_mask: false

roi_overlay_segment_label:
UNKNOWN : true
CAR : false
TRUCK : false
BUS : false
MOTORCYCLE : true
BICYCLE : true
PEDESTRIAN : true
ANIMAL: true

model_path: "$(var data_path)/tensorrt_yolox/$(var model_name).onnx"
label_path: "$(var data_path)/tensorrt_yolox/label.txt"
color_map_path: "$(var data_path)/tensorrt_yolox/semseg_color_map.csv"
score_threshold: 0.35
nms_threshold: 0.7
precision: "int8" # Operation precision to be used on inference. Valid value is one of: [fp32, fp16, int8].
Expand Down
17 changes: 16 additions & 1 deletion perception/tensorrt_yolox/include/tensorrt_yolox/preprocess.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,21 @@ extern void crop_resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
extern void multi_scale_resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
float * dst, unsigned char * src, int d_w, int d_h, int d_c, Roi * d_roi, int s_w, int s_h,
int s_c, int batch, float norm, cudaStream_t stream);
} // namespace tensorrt_yolox

/**
* @brief Argmax on GPU
* @param[out] dst processed image
* @param[in] src probability map
* @param[in] d_w width for output
* @param[in] d_h height for output
* @param[in] s_w width for input
* @param[in] s_h height for input
* @param[in] s_c channel for input
* @param[in] batch batch size
* @param[in] stream cuda stream
*/
extern void argmax_gpu(
unsigned char * dst, float * src, int d_w, int d_h, int s_w, int s_h, int s_c, int batch,
cudaStream_t stream);
} // namespace tensorrt_yolox
#endif // TENSORRT_YOLOX__PREPROCESS_HPP_
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ struct GridAndStride
int stride;
};

typedef struct Colormap_
{
int id;
std::string name;
std::vector<unsigned char> color;
} Colormap;

/**
* @class TrtYoloX
* @brief TensorRT YOLOX for faster inference
Expand Down Expand Up @@ -85,7 +92,7 @@ class TrtYoloX
const bool use_gpu_preprocess = false, std::string calibration_image_list_file = std::string(),
const double norm_factor = 1.0, [[maybe_unused]] const std::string & cache_dir = "",
const tensorrt_common::BatchConfig & batch_config = {1, 1, 1},
const size_t max_workspace_size = (1 << 30));
const size_t max_workspace_size = (1 << 30), const std::string & color_map_path = "");
/**
* @brief Deconstruct TrtYoloX
*/
Expand All @@ -96,7 +103,9 @@ class TrtYoloX
* @param[out] objects results for object detection
* @param[in] images batched images
*/
bool doInference(const std::vector<cv::Mat> & images, ObjectArrays & objects);
bool doInference(
const std::vector<cv::Mat> & images, ObjectArrays & objects, std::vector<cv::Mat> & masks,
std::vector<cv::Mat> & color_masks);

/**
* @brief run inference including pre-process and post-process
Expand Down Expand Up @@ -130,6 +139,22 @@ class TrtYoloX
*/
void printProfiling(void);

/**
* @brief get num for multitask heads
*/
int getMultitaskNum(void);

/**
* @brief get colorized masks from index using specific colormap
* @param[out] cmask colorized mask
* @param[in] index multitask index
* @param[in] colormap colormap for masks
*/
void getColorizedMask(
const std::vector<tensorrt_yolox::Colormap> & colormap, const cv::Mat & mask,
cv::Mat & colorized_mask);
inline std::vector<Colormap> getColorMap() { return sematic_color_map_; }

private:
/**
* @brief run preprocess including resizing, letterbox, NHWC2NCHW and toFloat on CPU
Expand Down Expand Up @@ -177,7 +202,9 @@ class TrtYoloX
const cv::Mat & images, int batch_size, ObjectArrays & objects);

bool feedforward(const std::vector<cv::Mat> & images, ObjectArrays & objects);
bool feedforwardAndDecode(const std::vector<cv::Mat> & images, ObjectArrays & objects);
bool feedforwardAndDecode(
const std::vector<cv::Mat> & images, ObjectArrays & objects, std::vector<cv::Mat> & masks,
std::vector<cv::Mat> & color_masks);
void decodeOutputs(float * prob, ObjectArray & objects, float scale, cv::Size & img_size) const;
void generateGridsAndStride(
const int target_w, const int target_h, const std::vector<int> & strides,
Expand Down Expand Up @@ -206,6 +233,26 @@ class TrtYoloX
void nmsSortedBboxes(
const ObjectArray & face_objects, std::vector<int> & picked, float nms_threshold) const;

/**
* @brief get a mask image for a segmentation head
* @param[out] argmax argmax results
* @param[in] prob probability map
* @param[in] dims dimension for probability map
* @param[in] out_w mask width excluding letterbox
* @param[in] out_h mask height excluding letterbox
*/
cv::Mat getMaskImage(float * prob, nvinfer1::Dims dims, int out_w, int out_h);

/**
* @brief get a mask image on GPUs for a segmentation head
* @param[out] mask image
* @param[in] prob probability map on device
* @param[in] out_w mask width excluding letterbox
* @param[in] out_h mask height excluding letterbox
* @param[in] b current batch
*/
cv::Mat getMaskImageGpu(float * d_prob, nvinfer1::Dims dims, int out_w, int out_h, int b);

std::unique_ptr<tensorrt_common::TrtCommon> trt_common_;

std::vector<float> input_h_;
Expand Down Expand Up @@ -249,6 +296,20 @@ class TrtYoloX
CudaUniquePtrHost<Roi[]> roi_h_;
// device pointer for ROI
CudaUniquePtr<Roi[]> roi_d_;

// flag whether model has multitasks
int multitask_;
// buff size for segmentation heads
CudaUniquePtr<float[]> segmentation_out_prob_d_;
CudaUniquePtrHost<float[]> segmentation_out_prob_h_;
size_t segmentation_out_elem_num_;
size_t segmentation_out_elem_num_per_batch_;
std::vector<cv::Mat> segmentation_masks_;
// host buffer for argmax postprocessing on GPU
CudaUniquePtrHost<unsigned char[]> argmax_buf_h_;
// device buffer for argmax postprocessing on GPU
CudaUniquePtr<unsigned char[]> argmax_buf_d_;
std::vector<tensorrt_yolox::Colormap> sematic_color_map_;
};

} // namespace tensorrt_yolox
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#ifndef TENSORRT_YOLOX__TENSORRT_YOLOX_NODE_HPP_
#define TENSORRT_YOLOX__TENSORRT_YOLOX_NODE_HPP_

#include "object_recognition_utils/object_recognition_utils.hpp"

#include <autoware/universe_utils/ros/debug_publisher.hpp>
#include <autoware/universe_utils/system/stop_watch.hpp>
#include <image_transport/image_transport.hpp>
Expand All @@ -25,6 +27,7 @@
#include <sensor_msgs/msg/image.hpp>
#include <std_msgs/msg/header.hpp>
#include <tier4_perception_msgs/msg/detected_objects_with_feature.hpp>
#include <tier4_perception_msgs/msg/semantic.hpp>

#if __has_include(<cv_bridge/cv_bridge.hpp>)
#include <cv_bridge/cv_bridge.hpp>
Expand All @@ -41,10 +44,30 @@

namespace tensorrt_yolox
{
// cspell: ignore Semseg
using LabelMap = std::map<int, std::string>;

using Label = tier4_perception_msgs::msg::Semantic;
class TrtYoloXNode : public rclcpp::Node
{
struct RoiOverlaySemsegLabel
{
bool UNKNOWN;
bool CAR;
bool TRUCK;
bool BUS;
bool MOTORCYCLE;
bool BICYCLE;
bool PEDESTRIAN;
bool ANIMAL;
bool isOverlay(const uint8_t label) const
{
return (label == Label::UNKNOWN && UNKNOWN) || (label == Label::CAR && CAR) ||
(label == Label::TRUCK && TRUCK) || (label == Label::BUS && BUS) ||
(label == Label::ANIMAL && ANIMAL) || (label == Label::MOTORBIKE && MOTORCYCLE) ||
(label == Label::BICYCLE && BICYCLE) || (label == Label::PEDESTRIAN && PEDESTRIAN);
};
}; // struct RoiOverlaySemsegLabel

public:
explicit TrtYoloXNode(const rclcpp::NodeOptions & node_options);

Expand All @@ -53,8 +76,12 @@ class TrtYoloXNode : public rclcpp::Node
void onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg);
bool readLabelFile(const std::string & label_path);
void replaceLabelMap();

void overlapSegmentByRoi(const tensorrt_yolox::Object & object, cv::Mat & mask);
int mapRoiLabel2SegLabel(const int32_t roi_label_index);
image_transport::Publisher image_pub_;
image_transport::Publisher mask_pub_;
image_transport::Publisher color_mask_pub_;

rclcpp::Publisher<tier4_perception_msgs::msg::DetectedObjectsWithFeature>::SharedPtr objects_pub_;

image_transport::Subscriber image_sub_;
Expand All @@ -63,6 +90,21 @@ class TrtYoloXNode : public rclcpp::Node

LabelMap label_map_;
std::unique_ptr<tensorrt_yolox::TrtYoloX> trt_yolox_;
bool is_roi_overlap_segment_;
bool is_publish_color_mask_;
float overlap_roi_score_threshold_;
// TODO(badai-nguyen): change to function
std::map<std::string, int> remap_roi_to_semantic_ = {
{"UNKNOWN", 3}, // other
{"ANIMAL", 0}, // other
{"PEDESTRIAN", 6}, // person
{"CAR", 7}, // car
{"TRUCK", 7}, // truck
{"BUS", 7}, // bus
{"BICYCLE", 8}, // bicycle
{"MOTORBIKE", 8}, // motorcycle
};
RoiOverlaySemsegLabel roi_overlay_segment_labels_;
std::unique_ptr<autoware::universe_utils::StopWatch<std::chrono::milliseconds>> stop_watch_ptr_;
std::unique_ptr<autoware::universe_utils::DebugPublisher> debug_publisher_;
};
Expand Down
12 changes: 10 additions & 2 deletions perception/tensorrt_yolox/launch/yolox_s_plus_opt.launch.xml
Original file line number Diff line number Diff line change
@@ -1,22 +1,30 @@
<?xml version="1.0"?>
<launch>
<!-- cspell:ignore finetune -->
<!-- cspell: ignore semseg, finetune -->
<arg name="input/image" default="/sensing/camera/camera0/image_rect_color"/>
<arg name="output/objects" default="/perception/object_recognition/detection/rois0"/>
<arg name="model_name" default="yolox-sPlus-T4-960x960-pseudo-finetune"/>
<arg name="output/mask" default="/perception/object_recognition/detection/mask0"/>
<arg
name="model_name"
default="yolox-sPlus-opt-pseudoV2-T4-960x960-T4-seg16cls"
description="options `yolox-sPlus-T4-960x960-pseudo-finetune` if only detection is needed, `yolox-sPlus-opt-pseudoV2-T4-960x960-T4-seg16cls` if sematic segmentation is also needed"
/>
<arg name="data_path" default="$(env HOME)/autoware_data" description="packages data and artifacts directory path"/>
<arg name="yolox_param_path" default="$(find-pkg-share tensorrt_yolox)/config/yolox_s_plus_opt.param.yaml"/>
<arg name="use_decompress" default="true" description="use image decompress"/>
<arg name="build_only" default="false" description="exit after trt engine is built"/>

<arg name="param_file" default="$(find-pkg-share image_transport_decompressor)/config/image_transport_decompressor.param.yaml"/>
<node pkg="image_transport_decompressor" exec="image_transport_decompressor_node" name="image_transport_decompressor_node" if="$(var use_decompress)">
<remap from="~/input/compressed_image" to="$(var input/image)/compressed"/>
<remap from="~/output/raw_image" to="$(var input/image)"/>
<param from="$(var param_file)"/>
</node>

<node pkg="tensorrt_yolox" exec="tensorrt_yolox_node_exe" name="tensorrt_yolox" output="screen">
<remap from="~/in/image" to="$(var input/image)"/>
<remap from="~/out/objects" to="$(var output/objects)"/>
<remap from="~/out/mask" to="$(var output/mask)"/>
<param from="$(var yolox_param_path)" allow_substs="true"/>
<param name="build_only" value="$(var build_only)"/>
</node>
Expand Down
35 changes: 35 additions & 0 deletions perception/tensorrt_yolox/src/preprocess.cu
Original file line number Diff line number Diff line change
Expand Up @@ -594,4 +594,39 @@ void multi_scale_resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
cuda_gridsize(N), block, 0, stream>>>(N, dst, src, d_h, d_w, s_h, s_w, d_roi, norm, batch);
}

__global__ void argmax_gpu_kernel(
int N, unsigned char * dst, float * src, int dst_h, int dst_w, int src_c, int src_h, int src_w,
int batch)
{
// NHWC
int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;

if (index >= N) return;
int c = 0;
int w = index % dst_w;
int h = index / (dst_w);

int b;
for (b = 0; b < batch; b++) {
float max_prob = 0.0;
int max_index = 0;
int dst_index = w + dst_w * h + b * dst_h * dst_w;
for (c = 0; c < src_c; c++) {
int src_index = w + src_w * h + c * src_h * src_w + b * src_c * src_h * src_w;
max_index = max_prob < src[src_index] ? c : max_index;
max_prob = max_prob < src[src_index] ? src[src_index] : max_prob;
}
dst[dst_index] = max_index;
}
}

void argmax_gpu(
unsigned char * dst, float * src, int d_w, int d_h, int s_w, int s_h, int s_c, int batch,
cudaStream_t stream)
{
int N = d_w * d_h;
argmax_gpu_kernel<<<cuda_gridsize(N), block, 0, stream>>>(
N, dst, src, d_h, d_w, s_c, s_h, s_w, batch);
}

} // namespace tensorrt_yolox
Loading
Loading