YOLOV5加速之TensorRT Python版API构建模型

看到几篇文章在把YOLOV5转换到TensorRT（TRT）时，基本都是用C++ API来构建网络。实际上TRT也提供了Python版本的API，我自己尝试了一下，同样可以使用，效果也一样，下面贴出代码：
from collections import OrderedDict

import tensorrt as trt
import torch
from numpy import ceil
import numpy as np

# Logger handed to the TensorRT builder, created with WARNING severity.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Name given to the network input tensor.
INPUT_BLOB_NAME = "data"
# Intended name of the network output tensor; in the visible code it is only
# referenced from a commented-out line.
OUTPUT_BLOB_NAME = "prob"

# Input height and width; focus() uses them to size its strided slices.
INPUT_H = 640
INPUT_W = 320
# Number of classes; each detection convolution emits 3 * (CLASS_NUM + 5) channels.
CLASS_NUM = 6


def get_width(x, gw, divisor=8):
    """Scale the channel count ``x`` by ``gw`` and round up to a multiple of ``divisor``.

    Args:
        x: base channel count.
        gw: width multiplier applied to ``x``.
        divisor: the result is always a multiple of this value.

    Returns:
        The smallest multiple of ``divisor`` that is >= ``x * gw``, as an int.
    """
    scaled = x * gw
    n_groups = int(ceil(scaled / divisor))
    return n_groups * divisor


def get_depth(x, gd):
    """Scale the repeat count ``x`` by the depth multiplier ``gd``.

    Python's built-in ``round`` already rounds exact halves to the nearest
    even integer, so no manual half-way correction is needed. The previous
    extra decrement was only appropriate for a ``round`` that rounds halves
    away from zero, and made e.g. ``get_depth(9, 0.5)`` return 3 instead of 4.

    Args:
        x: base number of repeats.
        gd: depth multiplier.

    Returns:
        ``1`` when ``x == 1``; otherwise ``round(x * gd)`` clamped to at least 1.
    """
    if x == 1:
        return 1
    return max(int(round(x * gd)), 1)


def autopad(k, p=None):  # kernel, padding
    """Return the padding to use for kernel size ``k``.

    Args:
        k: kernel size, either an int or an iterable of ints.
        p: explicit padding; returned unchanged when it is not ``None``.

    Returns:
        ``p`` if given; otherwise ``k // 2`` for an int kernel, or a list with
        ``x // 2`` for every element of an iterable kernel ('same'-style padding).
    """
    if p is not None:
        return p
    if isinstance(k, int):
        return k // 2
    return [dim // 2 for dim in k]


def addBatchNorm2D(network, weights, input, layer_name, eps=1e-3):
    """Add an inference-time batch norm as a per-channel TensorRT scale layer.

    The normalisation ``gamma * (x - mean) / sqrt(var + eps) + beta`` is folded
    into ``scale * x + shift`` so that a single scale layer reproduces it.

    Args:
        network: TensorRT network definition the layer is added to.
        weights: mapping from parameter name to a tensor exposing ``.numpy()``;
            must contain ``<layer_name>.weight``, ``.bias``, ``.running_mean``
            and ``.running_var``.
        input: tensor feeding the layer.
        layer_name: key prefix of the batch-norm parameters in ``weights``.
        eps: numerical-stability term added to the running variance. Defaults
            to 1e-3, the value YOLOv5 assigns to its BatchNorm2d layers (the
            previously hard-coded 1e-5 is PyTorch's generic default and does
            not match the trained model). Pass ``eps=1e-5`` for models that
            keep the PyTorch default.

    Returns:
        The scale layer created by ``network.add_scale``.
    """
    gamma = weights[layer_name + '.weight'].numpy()
    beta = weights[layer_name + '.bias'].numpy()
    mean = weights[layer_name + '.running_mean'].numpy()
    var = weights[layer_name + '.running_var'].numpy()
    std = np.sqrt(var + eps)
    scale = gamma / std
    shift = -mean / std * gamma + beta
    # add_scale takes the shift before the scale.
    return network.add_scale(input, trt.ScaleMode.CHANNEL, shift, scale)


def convBlock(network, weights, input, out_channel, ksize, s, g, layer_name):
    """Add convolution -> batch norm -> ``x * sigmoid(x)`` and return the last layer.

    Args:
        network: TensorRT network definition the layers are added to.
        weights: mapping from parameter name to a tensor exposing ``.numpy()``.
        input: tensor feeding the convolution.
        out_channel: number of convolution output maps.
        ksize: square kernel size (int); padding is ``autopad(ksize)``.
        s: stride applied in both spatial dimensions.
        g: number of convolution groups.
        layer_name: key prefix; ``.conv.weight`` and ``.bn.*`` entries are read.

    Returns:
        The element-wise product layer (batch-norm output times its sigmoid).
    """
    pad = autopad(ksize)
    kernel = weights[layer_name + '.conv.weight'].numpy()
    # No bias is passed to the convolution; the batch-norm shift follows it.
    conv = network.add_convolution_nd(input, out_channel, (ksize, ksize), kernel)
    conv.stride_nd = (s, s)
    conv.padding_nd = (pad, pad)
    conv.num_groups = g

    normed = addBatchNorm2D(network, weights, conv.get_output(0), layer_name + ".bn")
    normed_out = normed.get_output(0)
    gate = network.add_activation(normed_out, trt.ActivationType.SIGMOID)
    return network.add_elementwise(normed_out, gate.get_output(0), trt.ElementWiseOperation.PROD)


def focus(network, weights, input, in_channel, out_channel, ksize, layer_name):
    """Split the input into four stride-2 sub-grids, concatenate them, then convolve.

    Each slice takes every second row and column starting at one of the four
    (row, col) offsets (0,0), (1,0), (0,1), (1,1); the slice extent is
    ``(in_channel, INPUT_H // 2, INPUT_W // 2)``.

    Args:
        network: TensorRT network definition the layers are added to.
        weights: mapping from parameter name to tensor, forwarded to convBlock.
        input: tensor to slice; assumed to be laid out as
            (channel, INPUT_H, INPUT_W) -- TODO confirm against the caller.
        in_channel: channel extent of every slice.
        out_channel: output maps of the trailing conv block.
        ksize: kernel size of the trailing conv block.
        layer_name: key prefix; the conv block reads ``<layer_name>.conv.*``.

    Returns:
        The layer returned by convBlock.
    """
    extent = (in_channel, INPUT_H // 2, INPUT_W // 2)
    step = (1, 2, 2)
    # Start offsets as (channel, row, col), listed in concatenation order.
    starts = ((0, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 1))
    patches = [network.add_slice(input, start, extent, step).get_output(0) for start in starts]
    merged = network.add_concatenation(patches)
    return convBlock(network, weights, merged.get_output(0), out_channel, ksize, 1, 1, layer_name + '.conv')


def bottleneck(network, weights, input, c1, c2, shortcut, g, e, layer_name):
    """Add a 1x1 conv block followed by a 3x3 conv block, with an optional residual sum.

    Args:
        network: TensorRT network definition the layers are added to.
        weights: mapping from parameter name to tensor, forwarded to convBlock.
        input: tensor feeding the block.
        c1: channel count used only for the shortcut check (``c1 == c2``).
        c2: output channel count of the 3x3 conv block.
        shortcut: whether the residual sum is requested.
        g: group count of the 3x3 conv block.
        e: ratio giving the hidden channel count ``int(c2 * e)``.
        layer_name: key prefix; ``.cv1`` and ``.cv2`` sub-layers are read.

    Returns:
        The element-wise SUM layer of ``input`` and the 3x3 output when
        ``shortcut`` is truthy and ``c1 == c2``; otherwise the 3x3 conv block.
    """
    hidden = int(c2 * e)
    squeeze = convBlock(network, weights, input, hidden, 1, 1, 1, layer_name + '.cv1')
    expand = convBlock(network, weights, squeeze.get_output(0), c2, 3, 1, g, layer_name + '.cv2')
    if not shortcut or c1 != c2:
        return expand
    return network.add_elementwise(input, expand.get_output(0), trt.ElementWiseOperation.SUM)


def C3(network, weights, input, c1, c2, n, shortcut, g, e, layer_name):
    """Add a two-branch block: ``n`` chained bottlenecks beside a plain 1x1 conv.

    Both branches start from ``input`` with a 1x1 conv block of
    ``int(c2 * e)`` channels. The first branch then runs through ``n``
    bottlenecks; the two results are concatenated and fused by a final 1x1
    conv block with ``c2`` output channels.

    Args:
        network: TensorRT network definition the layers are added to.
        weights: mapping from parameter name to tensor, forwarded to sub-blocks.
        input: tensor feeding both branches.
        c1: unused by this function.
        c2: output channel count of the final conv block.
        n: number of bottlenecks chained on the first branch.
        shortcut: forwarded to every bottleneck.
        g: group count forwarded to every bottleneck.
        e: ratio giving the hidden channel count.
        layer_name: key prefix; ``.cv1``, ``.cv2``, ``.cv3`` and ``.m.<i>`` are read.

    Returns:
        The final (``.cv3``) conv block layer.
    """
    hidden = int(c2 * e)
    main_branch = convBlock(network, weights, input, hidden, 1, 1, 1, layer_name + '.cv1')
    side_branch = convBlock(network, weights, input, hidden, 1, 1, 1, layer_name + '.cv2')

    block_prefix = layer_name + '.m.'
    x = main_branch.get_output(0)
    for idx in range(n):
        # Bottlenecks inside C3 keep the channel count (expansion ratio 1.0).
        x = bottleneck(network, weights, x, hidden, hidden, shortcut, g, 1.0,
                       block_prefix + str(idx)).get_output(0)

    merged = network.add_concatenation([x, side_branch.get_output(0)])
    return convBlock(network, weights, merged.get_output(0), c2, 1, 1, 1, layer_name + '.cv3')


def SPP(network, weights, input, c1, c2, k1, k2, k3, layer_name):
    """Add a 1x1 conv, three stride-1 max pools of different sizes, and a fusing 1x1 conv.

    The reduced feature map and its three pooled versions (kernel sizes
    ``k1``, ``k2``, ``k3``, each padded by ``k // 2`` with stride 1) are
    concatenated in that order and passed through a final 1x1 conv block.

    Args:
        network: TensorRT network definition the layers are added to.
        weights: mapping from parameter name to tensor, forwarded to convBlock.
        input: tensor feeding the block.
        c1: input channel count; the first conv block outputs ``c1 // 2`` channels.
        c2: output channel count of the final conv block.
        k1, k2, k3: square max-pool kernel sizes.
        layer_name: key prefix; ``.cv1`` and ``.cv2`` sub-layers are read.

    Returns:
        The final (``.cv2``) conv block layer.
    """
    hidden = c1 // 2
    reduced = convBlock(network, weights, input, hidden, 1, 1, 1, layer_name + ".cv1")
    reduced_out = reduced.get_output(0)

    branches = [reduced_out]
    for k in (k1, k2, k3):
        pool = network.add_pooling_nd(reduced_out, trt.PoolingType.MAX, (k, k))
        pool.padding_nd = (k // 2, k // 2)
        pool.stride_nd = (1, 1)
        branches.append(pool.get_output(0))

    fused = network.add_concatenation(branches)
    return convBlock(network, weights, fused.get_output(0), c2, 1, 1, 1, layer_name + '.cv2')


def addYoLoLayer(network, weights, layer_name, input):
    """Placeholder for a YOLO output layer: adds nothing and returns ``network``.

    ``weights``, ``layer_name`` and ``input`` are currently ignored.
    """
    return network


pt_file = "/yolov5l.pt"
# NOTE(security): torch.load unpickles arbitrary objects -- only load checkpoints
# from a trusted source.
model = torch.load(pt_file, map_location="cpu")['model']
# The 'model' entry may be a plain name->tensor mapping or a full nn.Module;
# a module exposes its tensors through state_dict(), a mapping is used as-is.
_state_dict = model.state_dict() if hasattr(model, 'state_dict') else model
new_state_dictBA = OrderedDict()
for k, v in _state_dict.items():
    # Drop the `module.` prefix from keys that carry it.
    name = k[7:] if k[:7] == 'module.' else k
    # Load to FP32: cast floating-point tensors (e.g. FP16 checkpoints) so every
    # array later obtained via .numpy() is float32; integer tensors are kept.
    if torch.is_tensor(v) and v.is_floating_point():
        v = v.float()
    new_state_dictBA[name] = v
model_weights = new_state_dictBA

# Build the YOLOv5 network layer by layer ("model.<N>" names index into
# model_weights), then build and serialize the TensorRT engine.
with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network:
    # Width / depth multipliers passed to get_width() / get_depth().
    gw = 1.
    gd = 1.
    # The input shape comes from the module constants so that it always agrees
    # with the slice sizes computed in focus() from INPUT_H / INPUT_W.
    input_tensor = network.add_input(name=INPUT_BLOB_NAME, dtype=trt.float32, shape=(3, INPUT_H, INPUT_W))

    # model.0 - model.9: focus, strided conv blocks, C3 blocks and SPP.
    focus0 = focus(network, model_weights, input_tensor, 3, get_width(64, gw), 3, "model.0")
    conv1 = convBlock(network, model_weights, focus0.get_output(0), get_width(128, gw), 3, 2, 1, "model.1")
    bottleneck_CSP2 = C3(network, model_weights, conv1.get_output(0),
                         get_width(128, gw), get_width(128, gw),
                         get_depth(3, gd), True, 1, 0.5, "model.2")
    conv3 = convBlock(network, model_weights, bottleneck_CSP2.get_output(0), get_width(256, gw), 3, 2, 1, "model.3")
    bottleneck_csp4 = C3(network, model_weights, conv3.get_output(0), get_width(256, gw), get_width(256, gw),
                         get_depth(9, gd), True, 1, 0.5, "model.4")
    conv5 = convBlock(network, model_weights, bottleneck_csp4.get_output(0), get_width(512, gw), 3, 2, 1, "model.5")
    bottleneck_csp6 = C3(network, model_weights, conv5.get_output(0), get_width(512, gw), get_width(512, gw),
                         get_depth(9, gd), True, 1, 0.5, "model.6")
    conv7 = convBlock(network, model_weights, bottleneck_csp6.get_output(0), get_width(1024, gw), 3, 2, 1, "model.7")
    spp8 = SPP(network, model_weights, conv7.get_output(0), get_width(1024, gw), get_width(1024, gw),
               5, 9, 13, "model.8")
    bottleneck_csp9 = C3(network, model_weights, spp8.get_output(0), get_width(1024, gw), get_width(1024, gw),
                         get_depth(3, gd), False, 1, 0.5, "model.9")

    # model.10 - model.13: resize to the shape of the model.6 output and
    # concatenate with it.
    conv10 = convBlock(network, model_weights, bottleneck_csp9.get_output(0), get_width(512, gw), 1, 1, 1, "model.10")
    upsample11 = network.add_resize(conv10.get_output(0))
    upsample11.resize_mode = trt.ResizeMode.NEAREST
    upsample11.shape = bottleneck_csp6.get_output(0).shape
    cat12 = network.add_concatenation([upsample11.get_output(0), bottleneck_csp6.get_output(0)])
    bottleneck_csp13 = C3(network, model_weights, cat12.get_output(0), get_width(1024, gw), get_width(512, gw),
                          get_depth(3, gd), False, 1, 0.5, "model.13")

    # model.14 - model.17: resize to the shape of the model.4 output and
    # concatenate with it.
    conv14 = convBlock(network, model_weights, bottleneck_csp13.get_output(0), get_width(256, gw), 1, 1, 1, "model.14")
    upsample15 = network.add_resize(conv14.get_output(0))
    upsample15.resize_mode = trt.ResizeMode.NEAREST
    upsample15.shape = bottleneck_csp4.get_output(0).shape
    cat16 = network.add_concatenation([upsample15.get_output(0), bottleneck_csp4.get_output(0)])
    bottleneck_csp17 = C3(network, model_weights, cat16.get_output(0), get_width(512, gw), get_width(256, gw),
                          get_depth(3, gd), False, 1, 0.5, "model.17")

    # Detection convolutions (model.24.m.<i>): 1x1 convs with bias that emit
    # 3 * (CLASS_NUM + 5) channels each.
    det0 = network.add_convolution_nd(bottleneck_csp17.get_output(0), int(3 * (CLASS_NUM + 5)), (1, 1),
                                      model_weights["model.24.m.0.weight"].numpy(), model_weights["model.24.m.0.bias"].numpy())
    conv18 = convBlock(network, model_weights, bottleneck_csp17.get_output(0), get_width(256, gw), 3, 2, 1, "model.18")
    cat19 = network.add_concatenation([conv18.get_output(0), conv14.get_output(0)])

    bottleneck_csp20 = C3(network, model_weights, cat19.get_output(0), get_width(512, gw), get_width(512, gw),
                          get_depth(3, gd), False, 1, 0.5, "model.20")
    det1 = network.add_convolution_nd(bottleneck_csp20.get_output(0), int(3 * (CLASS_NUM + 5)), (1, 1),
                                      model_weights["model.24.m.1.weight"].numpy(), model_weights["model.24.m.1.bias"].numpy())
    conv21 = convBlock(network, model_weights, bottleneck_csp20.get_output(0), get_width(512, gw), 3, 2, 1, "model.21")
    cat22 = network.add_concatenation([conv21.get_output(0), conv10.get_output(0)])

    bottleneck_csp23 = C3(network, model_weights, cat22.get_output(0), get_width(1024, gw), get_width(1024, gw),
                          get_depth(3, gd), False, 1, 0.5, "model.23")
    det2 = network.add_convolution_nd(bottleneck_csp23.get_output(0), int(3 * (CLASS_NUM + 5)), (1, 1),
                                      model_weights["model.24.m.2.weight"].numpy(), model_weights["model.24.m.2.bias"].numpy())

    # NOTE: only det0 is marked as an engine output. det1 / det2 and a YOLO
    # decode layer named OUTPUT_BLOB_NAME are not wired up yet (addYoLoLayer
    # is still a stub):
    # yolo = addYoLoLayer(network, model_weights, "model.24", [det0, det1, det2])
    # yolo.get_output(0).name = OUTPUT_BLOB_NAME
    network.mark_output(det0.get_output(0))
    # network.mark_output(det1.get_output(0))
    # network.mark_output(det2.get_output(0))

    builder.max_batch_size = 144
    config = builder.create_builder_config()
    engine = builder.build_engine(network, config)
    # Guard against a missing engine so a failed build is reported clearly
    # instead of surfacing as an AttributeError on None.serialize().
    if engine is None:
        raise RuntimeError("TensorRT engine build failed; check the TensorRT logger output")
    binary_model = engine.serialize()
    with open('./yolov5l.engine', 'wb') as fp:
        fp.write(binary_model)
代码暂时还没有封装，但可以正常使用，后续会继续维护。
YOLOV5加速之TensorRT Python版API构建模型

Python相关栏目本月热门文章