
MNN inference speed is slow #3150

Open
Nightmare4214 opened this issue Jan 6, 2025 · 4 comments

Labels
User (question about how to use MNN, or MNN used incorrectly causing a bug)

Comments

@Nightmare4214 commented Jan 6, 2025

Platform (if cross-compiling, please also list the cross-compilation target platform):

linux

GitHub version:

python 3.10.13
torch 2.5.1
onnx 1.17.0
onnxruntime 1.20.1
onnxscript 0.1.0.dev20241231
MNN 3.03

Problem

Model:

import torch
from torch import nn


class MultiVarCnnV3(nn.Module):
    def __init__(self):
        super(MultiVarCnnV3, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv1d(in_channels=54, out_channels=32, kernel_size=3, stride=1, padding="same"),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.MaxPool1d(2, 2),

            nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding="same"),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.MaxPool1d(2, 2),

            nn.Conv1d(in_channels=64, out_channels=32, kernel_size=3, stride=1, padding="same"),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.MaxPool1d(2, 2),
        )
        self.linear = nn.Sequential(
            nn.Flatten(), 
            
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(1024, 1)
        )
        self._initialize_weights()

    @torch.no_grad()
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv1d):
                nn.init.normal_(m.weight, std=0.01)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.ConvTranspose1d):
                nn.init.normal_(m.weight, std=0.01)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Embedding):
                m.weight.data.normal_(mean=0.0, std=1.0)
                if m.padding_idx is not None:
                    m.weight.data[m.padding_idx].zero_()
            elif isinstance(m, nn.LayerNorm):
                # m.weight.data.normal_(mean=0.0, std=1.0)
                nn.init.kaiming_uniform_(
                    m.weight, a=0, mode="fan_in", nonlinearity="relu"
                )
                if m.bias is not None:
                    m.bias.data.zero_()

    def forward(self, x):
        x = self.cnn(x)

        return self.linear(x).squeeze(-1)  # (B,)

Convert to ONNX

dummy_input = torch.zeros(input_shape, dtype=torch.float32)
torch.onnx.export(
    model,                         # model being run
    dummy_input,                   # model input (or a tuple for multiple inputs)
    "MultiVarCnn.onnx",            # where to save the model
    export_params=True,            # store the trained parameter weights inside the model file
    # dynamo=True,
    opset_version=20,              # the ONNX version to export the model to
    # do_constant_folding=True,    # whether to execute constant folding for optimization
    input_names=['modelInput'],    # the model's input names
    output_names=['modelOutput'],  # the model's output names
    # dynamic_axes={'modelInput': {0: 'batch_size'},    # variable-length axes
    #               'modelOutput': {0: 'batch_size'}}
)

Convert to MNN

python -m MNN.tools.mnnconvert -f ONNX --modelFile MultiVarCnn.onnx --MNNModel MultiVarCnn.mnn --bizCode MNN
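
To sanity-check the conversion before benchmarking, one can compare the .mnn output against onnxruntime on the same input. This is only a sketch using the APIs that appear in the snippets below, with input_shape as defined for the export:

import numpy as np
import onnxruntime as ort
import MNN.nn as mnn_nn
import MNN.expr as mnn_expr

x = np.random.rand(*input_shape).astype(np.float32)

# reference output from onnxruntime
ort_sess = ort.InferenceSession("MultiVarCnn.onnx")
onnx_y = ort_sess.run(None, {"modelInput": x})[0]

# output from the converted MNN model
net = mnn_nn.load_module_from_file("MultiVarCnn.mnn", ["modelInput"], ["modelOutput"])
mnn_y = net.forward(mnn_expr.const(x, input_shape, mnn_expr.NCHW, mnn_expr.float)).read()

print("max abs diff:", np.abs(np.asarray(onnx_y).squeeze() - np.asarray(mnn_y).squeeze()).max())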

Inference

PyTorch

from time import time

with torch.no_grad():
    # warmup
    for _ in range(50):
        torch_y = model(torch.rand(input_shape, dtype=torch.float32)).item()

    total_time = 0
    for _ in range(1000):
        cur = torch.rand(input_shape, dtype=torch.float32)
        start = time()
        torch_y = model(cur).item()
        end = time()
        total_time += end - start

    print("total_time: avg: {}, sum: {}".format(total_time / 1000., total_time))

ONNX

import onnxruntime as ort

ort_sess = ort.InferenceSession("MultiVarCnn.onnx")

# warmup
for _ in range(50):
    onnx_y = ort_sess.run(None, {"modelInput": torch.rand(input_shape, dtype=torch.float32).numpy()})[0].item() # (B,)

total_time = 0
for _ in range(1000):
    cur = torch.rand(input_shape, dtype=torch.float32)
    start = time()
    onnx_y = ort_sess.run(None, {"modelInput": cur.numpy()})[0].item() # (B,)
    end = time()
    total_time += end - start

print("total_time: avg: {}, sum: {}".format(total_time / 1000., total_time))

MNN-interpreter

import MNN

interpreter = MNN.Interpreter("MultiVarCnn.mnn")
config = {}
# config["precision"] = "low"  # use fp16 inference when the hardware supports it (armv8.2)
config["backend"] = 0       # CPU
config["numThread"] = 4     # number of threads
session = interpreter.createSession(config)
input_tensors = interpreter.getSessionInput(session)
output_tensors = interpreter.getSessionOutput(session)

# warmup
for _ in range(50):
    input_tensors.copyFrom( MNN.Tensor(input_shape, MNN.Halide_Type_Float, torch.rand(input_shape, dtype=torch.float32).numpy(), MNN.Tensor_DimensionType_Caffe))
    # run inference
    interpreter.runSession(session)
    # convert the output to numpy
    # mnn_y = output_var.read()
    mnn_y = output_tensors.getData()[0]
convert_time = 0
infer_time = 0
out_time = 0
for _ in range(1000):
    cur = torch.rand(input_shape, dtype=torch.float32)
    start = time()
    input_tensors.copyFrom( MNN.Tensor(input_shape, MNN.Halide_Type_Float, cur.numpy(), MNN.Tensor_DimensionType_Caffe))
    end = time()
    convert_time += end - start
    # run inference
    start = time()
    interpreter.runSession(session)
    end = time()
    infer_time += end - start
    # convert the output to numpy
    start = time()
    mnn_y = output_tensors.getData()[0]
    end = time()
    out_time += end - start

total_time = convert_time + infer_time + out_time
print("convert_time: avg: {}, sum: {}".format(convert_time / 1000., convert_time))
print("infer_time: avg: {}, sum: {}".format(infer_time / 1000., infer_time))
print("out_time: avg: {}, sum: {}".format(out_time / 1000., out_time))
print("total_time: avg: {}, sum: {}".format(total_time / 1000., total_time))

MNN-expr

import MNN.nn as mnn_nn
import MNN.expr as mnn_expr

config = {}
# config["precision"] = "low"  # use fp16 inference when the hardware supports it (armv8.2)
config["backend"] = 0       # CPU
config["numThread"] = 4     # number of threads
rt = mnn_nn.create_runtime_manager((config,))
# load the model and create a _Module
net = mnn_nn.load_module_from_file("MultiVarCnn.mnn", ["modelInput"], ["modelOutput"], runtime_manager=rt)
# warmup
for _ in range(50):
    # TODO: pass the numpy array to MNN directly
    input_var = mnn_expr.const(torch.rand(input_shape, dtype=torch.float32).tolist(), input_shape, mnn_expr.NCHW, mnn_expr.float)
    # run inference
    output_var = net.forward(input_var)
    # convert the output to numpy
    # mnn_y = output_var.read()
    mnn_y = output_var[0]
convert_time = 0
infer_time = 0
out_time = 0
for _ in range(1000):
    cur = torch.rand(input_shape, dtype=torch.float32)
    # TODO: pass the numpy array to MNN directly
    start = time()
    input_var = mnn_expr.const(cur.numpy(), input_shape, mnn_expr.NCHW, mnn_expr.float)
    end = time()
    convert_time += end - start
    # run inference
    start = time()
    output_var = net.forward(input_var)
    end = time()
    infer_time += end - start
    # convert the output to numpy
    start = time()
    mnn_y = output_var[0]
    end = time()
    out_time += end - start

total_time = convert_time + infer_time + out_time
print("convert_time: avg: {}, sum: {}".format(convert_time / 1000., convert_time))
print("infer_time: avg: {}, sum: {}".format(infer_time / 1000., infer_time))
print("out_time: avg: {}, sum: {}".format(out_time / 1000., out_time))
print("total_time: avg: {}, sum: {}".format(total_time / 1000., total_time))

MNN-array

import MNN.numpy as mnn_np

config = {}
# config["precision"] = "low"  # use fp16 inference when the hardware supports it (armv8.2)
config["backend"] = 0       # CPU
config["numThread"] = 4     # number of threads
rt = mnn_nn.create_runtime_manager((config,))
# load the model and create a _Module
net = mnn_nn.load_module_from_file("MultiVarCnn.mnn", ["modelInput"], ["modelOutput"], runtime_manager=rt)
# warmup
for _ in range(50):
    # TODO: pass the numpy array to MNN directly
    input_var = mnn_np.array(torch.rand(input_shape, dtype=torch.float32).tolist(), dtype=mnn_np.float32)
    # run inference
    output_var = net.forward(input_var)
    # convert the output to numpy
    # mnn_y = output_var.read()
    mnn_y = output_var[0]
convert_time = 0
infer_time = 0
out_time = 0
for _ in range(1000):
    cur = torch.rand(input_shape, dtype=torch.float32)
    # TODO: pass the numpy array to MNN directly
    start = time()
    input_var = mnn_np.array(cur.tolist(), dtype=mnn_np.float32)
    end = time()
    convert_time += end - start
    # run inference
    start = time()
    output_var = net.forward(input_var)
    end = time()
    infer_time += end - start
    # convert the output to numpy
    start = time()
    mnn_y = output_var[0]
    end = time()
    out_time += end - start

total_time = convert_time + infer_time + out_time
print("convert_time: avg: {}, sum: {}".format(convert_time / 1000., convert_time))
print("infer_time: avg: {}, sum: {}".format(infer_time / 1000., infer_time))
print("out_time: avg: {}, sum: {}".format(out_time / 1000., out_time))
print("total_time: avg: {}, sum: {}".format(total_time / 1000., total_time))

Speed comparison

50 warmup iterations, then 1000 timed inference runs (total time):

Model            Time (s)
torch            0.8798
onnx             0.5565
MNN-interpreter  1.2777
MNN-expr         1.1739
MNN-array        4.3277

So it looks like: onnx < torch < MNN-expr < MNN-interpreter < MNN-array.
MNN doesn't seem to be as fast as expected. What could be the reason?

@jxt1234 (Collaborator) commented Jan 7, 2025

Conversion operations like cur.tolist() are very expensive. I suggest comparing only the infer time first.
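
A minimal sketch of that suggestion for the interpreter path above: copy the input once outside the loop so that only runSession is timed (assuming the interpreter, session, input_tensors, and input_shape already set up above; np is standard numpy):

import numpy as np

# build the input once, outside the timed loop
dummy = np.random.rand(*input_shape).astype(np.float32)
input_tensors.copyFrom(MNN.Tensor(input_shape, MNN.Halide_Type_Float, dummy, MNN.Tensor_DimensionType_Caffe))

infer_time = 0
for _ in range(1000):
    start = time()
    interpreter.runSession(session)  # inference only, no host-side data conversion
    end = time()
    infer_time += end - start

print("pure infer_time: avg: {}, sum: {}".format(infer_time / 1000., infer_time))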

@jxt1234 (Collaborator) commented Jan 7, 2025

I recommend doing the data preprocessing entirely with MNN's cv / numpy libraries, to avoid the cost of redundant data conversions.
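
For instance, the .tolist() round-trip can be dropped entirely: the expr snippet above already passes a numpy buffer to mnn_expr.const, so a sketch of a conversion-light input path (assuming the same net and input_shape) is:

import numpy as np

cur = np.random.rand(*input_shape).astype(np.float32)
# hand the numpy buffer to MNN directly, no Python-list round-trip
input_var = mnn_expr.const(cur, input_shape, mnn_expr.NCHW, mnn_expr.float)
output_var = net.forward(input_var)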

@Nightmare4214 (Author) commented

But even when converting directly with expr, MNN still seems slower than onnx.

@jxt1234 (Collaborator) commented Jan 7, 2025

mnn_expr.const(torch.rand(input_shape, dtype=torch.float32).tolist(), input_shape, mnn_expr.NCHW, mnn_expr.float)

This line itself is also quite expensive. Benchmark the model inference on its own.
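
A minimal sketch of such an isolated benchmark for the module path (same net and input_shape as above; the output read is kept inside the timing so that any lazy evaluation is forced to finish in the loop):

import numpy as np

# build the input once; nothing torch-related inside the timed loop
input_var = mnn_expr.const(np.random.rand(*input_shape).astype(np.float32), input_shape, mnn_expr.NCHW, mnn_expr.float)

infer_time = 0
for _ in range(1000):
    start = time()
    output_var = net.forward(input_var)
    mnn_y = output_var[0]  # touch the output so the computation is complete
    end = time()
    infer_time += end - start

print("pure forward: avg: {}, sum: {}".format(infer_time / 1000., infer_time))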
