栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > C/C++/C#

xavier平台上YOLOV5的TensorRT加速

C/C++/C# 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

xavier平台上YOLOV5的TensorRT加速

1.普通TensorRT加速版本

这个版本就是用tensorRT进行加速,参考的是https://github.com/zerollzeng/tiny-tensorrt,修改后的代码如下

1.1 trt.h
#ifndef TRT_HPP
#define TRT_HPP

// NOTE(review): the original include list and all template arguments were
// garbled (angle-bracket contents stripped by HTML extraction); restored to
// the headers/types this file visibly uses -- confirm against upstream.
#include <string>
#include <vector>
#include <memory>
#include <iostream>
#include <mutex>
#include "opencv2/opencv.hpp"
#include <NvInfer.h>
#include <NvInferPlugin.h>
#include <cuda_runtime.h>

// Axis-aligned detection box in image pixel coordinates.
struct Bbox {
    int left, right, top, bottom;  // box corners
    int clsId;                     // predicted class index
    float score;                   // confidence score
};

// Minimal TensorRT logger: prints every message except verbose-level ones.
class TrtLogger : public nvinfer1::ILogger {
    void log(Severity severity, const char* msg) override
    {
        // suppress verbose-level messages
        if (severity != Severity::kVERBOSE)
            std::cout << msg << std::endl;
    }
};

// Thin wrapper around a TensorRT engine: builds/loads an engine, owns the
// device-side binding buffers, and runs (a)synchronous inference plus the
// YOLOv5 pre-/post-processing used by detect()/detect1().
class Trt {
public:
    Trt();

    ~Trt();
    // Serialize mEngine to `fileName`; no-op on empty name or null engine.
    void SaveEngine(const std::string& fileName);
    // Build an engine from `network` (FP16 where the platform supports it).
    void myBuildEngine(nvinfer1::IBuilder* builder,
                      nvinfer1::INetworkDefinition* network);
    // Load a serialized engine file and initialize bindings.
    void DEngine(const std::string& engineFile);
    // Batch-preprocess images listed in `file_list` into `out_data`.
    // NOTE(review): signature reconstructed from garbled source -- confirm.
    void img_process(std::string file_list, nvinfer1::Dims3 input_dim,
                     std::vector<std::vector<float>>& out_data);
    // Synchronous inference on the currently bound device buffers.
    void Forward();
    // Letterbox preprocessing + async inference + decode into `result`.
    void detect(cv::Mat img, std::vector<Bbox>& result, cudaStream_t mystream);
    // Plain-resize preprocessing + sync inference + decode into `result`.
    void detect1(cv::Mat img, std::vector<Bbox>& result);
    // Parse an ONNX model, build an engine and save it to `engineFile`.
    bool BuildEngineWithonnx(const std::string& onnxModel, const std::string& engineFile);
    void ForwardAsync(cudaStream_t& stream);
    void DataTransfer(std::vector<float>& data, int bindIndex, bool isHostToDevice);
    void DataTransferAsync(std::vector<float>& data, int bindIndex, bool isHostToDevice, cudaStream_t& stream);
    void CopyFromHostToDevice(const std::vector<float>& input, int bindIndex);
    void CopyFromDeviceToHost(std::vector<float>& output, int bindIndex);
    void CopyFromHostToDevice(const std::vector<float>& input, int bindIndex, cudaStream_t& stream);
    void CopyFromDeviceToHost(std::vector<float>& output, int bindIndex, cudaStream_t& stream);
    void SetDevice(int device);
    void InitEngine();
    int GetDevice() const;

    // Maximum batch size of the loaded/built engine.
    int GetMaxBatchSize() const;

    // Raw device pointer of a binding buffer.
    void* GetBindingPtr(int bindIndex) const;

    // Size in bytes of a binding buffer.
    size_t GetBindingSize(int bindIndex) const;

    nvinfer1::Dims GetBindingDims(int bindIndex) const;

    nvinfer1::DataType GetBindingDataType(int bindIndex) const;

    // Per-binding metadata, indexed by TensorRT binding index.
    std::vector<std::string> mBindingName;
    std::vector<void*> mBinding;              // device buffers, freed in ~Trt()
    std::vector<size_t> mBindingSize;         // bytes per binding
    std::vector<nvinfer1::Dims> mBindingDims;
    std::vector<nvinfer1::DataType> mBindingDataType;
protected:

    bool DeserializeEngine(const std::string& engineFile);


protected:
    TrtLogger mLogger;
    // tensorrt run mode 0:fp32 1:fp16 2:int8
    int mRunMode;
    nvinfer1::ICudaEngine* mEngine = nullptr;
    nvinfer1::IExecutionContext* mContext = nullptr;
    nvinfer1::IRuntime* mRuntime = nullptr;

    int mInputSize = 0;      // number of input bindings
    std::mutex g_load_mtx;   // presumably guards engine loading -- TODO confirm usage
    int mBatchSize;
};

#endif
1.2 yolov5_utils.h

 

#ifndef UTILS_H
#define UTILS_H

// NOTE(review): include list restored after HTML extraction stripped the
// angle-bracket contents of the original #include lines.
#include <iostream>
#include <numeric>
#include <functional>
#include <stdexcept>
#include <cstdint>
#include <cstdlib>
#include "cuda_runtime.h"
#include <NvInfer.h>

#define UNUSED(unusedVariable) (void)(unusedVariable)
// suppress compiler warning: unused parameter

// Number of elements described by `d` (product of all dimensions).
// Accumulates in int64_t so large tensors cannot overflow int.
inline int64_t volume(const nvinfer1::Dims& d)
{
    return std::accumulate(d.d, d.d + d.nbDims, int64_t{1}, std::multiplies<int64_t>());
}

// Size in bytes of one element of the given TensorRT data type.
inline unsigned int getElementSize(nvinfer1::DataType t)
{
    switch (t)
    {
        case nvinfer1::DataType::kINT32: return 4;
        case nvinfer1::DataType::kFLOAT: return 4;
        case nvinfer1::DataType::kHALF: return 2;
        case nvinfer1::DataType::kINT8: return 1;
        default: throw std::runtime_error("Invalid DataType.");
    }
}

// Abort with a diagnostic when a CUDA runtime call fails.
// NOTE(review): macro line continuations restored; exit code changed from 0
// to EXIT_FAILURE so a failed CUDA call is reported as a failure.
#ifndef CUDA_CHECK
#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        cudaError_t error_code = callstr;                                                      \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
            exit(EXIT_FAILURE);                                                                \
        }                                                                                      \
    }
#endif

// cudaMalloc wrapper that aborts on failure instead of returning nullptr.
inline void* safeCudaMalloc(size_t memSize) {
    void* deviceMem;
    CUDA_CHECK(cudaMalloc(&deviceMem, memSize));
    if (deviceMem == nullptr) {
        std::cerr << "Out of memory" << std::endl;
        exit(1);
    }
    return deviceMem;
}

// cudaFree wrapper that aborts on failure.
inline void safeCudaFree(void* deviceMem) {
    CUDA_CHECK(cudaFree(deviceMem));
}

// Print an error message together with its source location.
inline void error(const std::string& message, const int line, const std::string& function, const std::string& file) {
    std::cout << message << " at " << line << " in " << function << " in " << file << std::endl;
}

// Explicit template instantiation helpers over the basic arithmetic types.
// NOTE(review): the type list was stripped from the source and has been
// restored to the conventional 14 basic types -- confirm against upstream.
#define COMPILE_TEMPLATE_BASIC_TYPES_CLASS(className) COMPILE_TEMPLATE_BASIC_TYPES(className, class)
#define COMPILE_TEMPLATE_BASIC_TYPES_STRUCT(className) COMPILE_TEMPLATE_BASIC_TYPES(className, struct)
#define COMPILE_TEMPLATE_BASIC_TYPES(className, classType) \
    template classType className<char>; \
    template classType className<signed char>; \
    template classType className<short>; \
    template classType className<int>; \
    template classType className<long>; \
    template classType className<long long>; \
    template classType className<unsigned char>; \
    template classType className<unsigned short>; \
    template classType className<unsigned int>; \
    template classType className<unsigned long>; \
    template classType className<unsigned long long>; \
    template classType className<float>; \
    template classType className<double>; \
    template classType className<long double>

struct YoloKernel;

// Number of box coordinates per detection (x, y, w, h).
static constexpr int LOCATIONS = 4;

// Raw YOLO detection as produced by the network / decode step.
struct alignas(float) Detection{
    //x y w h (center-based)
    float bbox[LOCATIONS];
    //float objectness;
    float classId;
    float prob;
};

#endif
1.3 trt.cpp
#include "trt.h"
#include "yolov5_utils.h"

// NOTE(review): the original #include targets were stripped by HTML
// extraction; restored to the headers this translation unit uses.
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <string>
#include <vector>

#include <NvInfer.h>
#include <NvInferPlugin.h>
#include <NvOnnxParser.h>
#include <cuda_runtime.h>

// Network input resolution (square), the YOLOv5 default of 640x640.
int detect_size=640;
// Floats per detection row in the decoded output tensor.
// NOTE(review): layout appears to be cx,cy,w,h,score,classId,... -- confirm.
int len=11;
using namespace std;
using namespace cv;
using namespace nvinfer1;
void Donms(std::vector& detections,int classes ,float nmsThresh)
{
    using namespace std;
    // auto t_start = chrono::high_resolution_clock::now();
    std::vector> resClass;
    resClass.resize(classes);
    for (const auto& item : detections)
        resClass[item.clsId].push_back(item);

    // auto iouCompute = [](float * lbox, float* rbox)
    // {
    //     float interBox[] = {
    //             max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left
    //             min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right
    //             max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top
    //             min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom
    //     };

    //     if(interBox[2] > interBox[3] || interBox[0] > interBox[1])
    //         return 0.0f;

    //     float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]);
    //     return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS);
    // };
    auto iouCompute = [](int x1,int x2,int y1,int y2,int x3,int x4,int y3,int y4 )
    {
        int interBox[] = {
                max(x1 , x3), //left
                min(x2 , x4), //right
                max(y1,y3), //top
                min(y2 ,y4), //bottom
        };

        if(interBox[2] > interBox[3] || interBox[0] > interBox[1])
            return 0.0f;

        float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]);
        return interBoxS/((x2-x1)*(y2-y1) + (x4-x3)*(y4-y3) -interBoxS);
    };
    std::vector result;
    for (int i = 0;i right.score;
        });

        for (unsigned int m = 0;m < dets.size() ; ++m)
        {
            auto& item = dets[m];
            result.push_back(item);
            for(unsigned int n = m + 1;n < dets.size() ; ++n)
            {
                if (iouCompute(item.left,item.right,item.top,item.bottom,dets[n].left,dets[n].right,dets[n].top,dets[n].bottom) > nmsThresh)
                {
                    dets.erase(dets.begin()+n);
                    --n;
                }
            }
        }
    }

    //swap(detections,result);
    detections = move(result);

    // auto t_end = chrono::high_resolution_clock::now();
    // float total = chrono::duration(t_end - t_start).count();
    // cout << "Time taken for nms is " << total << " ms." << endl;
}
// Serialize the current engine to `fileName`.
// No-op (with a log line) when the name is empty or no engine is loaded.
void Trt::SaveEngine(const std::string& fileName) {
    if(fileName == "") {
        cout<<("empty engine file name, skip save");
        return;
    }
    if(mEngine != nullptr) {
        nvinfer1::IHostMemory* data = mEngine->serialize();
        std::ofstream file;
        file.open(fileName,std::ios::binary | std::ios::out);
        if(!file.is_open()) {
            // previously the serialized blob leaked on this path and the
            // failure was silent
            std::cerr << "failed to open " << fileName << " for writing" << std::endl;
            data->destroy();
            return;
        }
        file.write((const char*)data->data(), data->size());
        file.close();
        data->destroy();
    } else {
       cout<<("engine is empty, save engine failed");
    }
}
void Trt::detect1(cv::Mat img,std::vector& result){

    int h1=img.rows;
    cout<<"h is"< out1,out2,out3;
    if(out_dims.size()==1)
    {
        out1.resize(out_dims[0],0);
    } else{
        out1.resize(out_dims[0],0);
        out2.resize(out_dims[1],0);
        out3.resize(out_dims[2],0);
    }
    std::vector data_in(inlen,0);
    float* data=data_in.data();
    cv::Mat sample_resized;
    cv::resize(img, sample_resized, cv::Size(detect_size, detect_size),0,0,cv::INTER_CUBIC);
    cv::Mat sample=sample_resized;
    cv::Mat sample_float;
    if (channel == 3)
        sample.convertTo(sample_float, CV_32FC3);
    else
        sample.convertTo(sample_float, CV_32FC1);
    vector input_channels;
    sample_float=sample_float/255.0;
    cv::split(sample_float, input_channels);
    memcpy(data,input_channels[2].data,detect_size*detect_size*sizeof(float));
    data+=detect_size*detect_size;
    memcpy(data,input_channels[1].data,detect_size*detect_size*sizeof(float));
    data+=detect_size*detect_size;
    memcpy(data,input_channels[0].data,detect_size*detect_size*sizeof(float));
    CopyFromHostToDevice(data_in,0);
    Forward();
    CopyFromDeviceToHost(out1,1);
    
    
    for(int i=0;i0.5)
        {
            Bbox tem_result;
            tem_result.clsId=int(out1[i*len+5]);
            tem_result.score=float(out1[i*len+4]);
            float lf=out1[i*len]-out1[i*len+2]/2;
            float rg=out1[i*len]+out1[i*len+2]/2;
            float tp=out1[i*len+1]-out1[i*len+3]/2;
            float bt=out1[i*len+1]+out1[i*len+3]/2;
            float scale=float(detect_size)/w1;
            float h_scale=float(detect_size)/h1;
            tem_result.left=int(lf/scale);
            tem_result.right=int(rg/scale);
            tem_result.top=int(tp/h_scale);
            tem_result.bottom=int(bt/h_scale);
            result.push_back(tem_result);
        }
        
    }
    cout<<"before size "<& result, cudaStream_t mystream){

    
    int size=mBinding.size();
    vector out_dims;
    nvinfer1::Dims in_DIms=mBindingDims[0];
    for(int i=1;i out1,out2,out3;
    if(out_dims.size()==1)
    {
        out1.resize(out_dims[0],0);
    } else{
        out1.resize(out_dims[0],0);
        out2.resize(out_dims[1],0);
        out3.resize(out_dims[2],0);
    }
    std::vector data_in(inlen,0);
    int h1=img.rows;
    
    int w1=img.cols;
    cv::Mat sample_resized;
    float scale = std::min(float(detect_size) / w1, float(detect_size) / h1);
    auto scaleSize = cv::Size(int(img.cols * scale), int(img.rows * scale));
    cv::resize(img, sample_resized, scaleSize, 0, 0, cv::INTER_CUBIC);
    cv::Mat cropped(detect_size, detect_size, CV_8UC3, 127);
    cv::Rect rect(0, 0, scaleSize.width, scaleSize.height);
    sample_resized.copyTo(cropped(rect));
    float* data=data_in.data();
    cv::Mat sample=cropped;
    cv::Mat sample_float;
    if (channel == 3)
        sample.convertTo(sample_float, CV_32FC3);
    else
        sample.convertTo(sample_float, CV_32FC1);
    vector input_channels;
    sample_float=sample_float/255.0;
    cv::split(sample_float, input_channels);
    memcpy(data,input_channels[2].data,detect_size*detect_size*sizeof(float));
    data+=detect_size*detect_size;
    memcpy(data,input_channels[1].data,detect_size*detect_size*sizeof(float));
    data+=detect_size*detect_size;
    memcpy(data,input_channels[0].data,detect_size*detect_size*sizeof(float));

    CopyFromHostToDevice(data_in,0,mystream);
    ForwardAsync(mystream);
    CopyFromDeviceToHost(out1,1,mystream);
    
    
    for(int i=0;i0.5)
	    //if(out1[i*len+4]>0.001)
        {
            Bbox tem_result;
            //tem_result.clsId=int(out1[i*len+5])+1;
            tem_result.clsId=int(out1[i*len+5]);
            tem_result.score=float(out1[i*len+4]);
            float lf=out1[i*len]-out1[i*len+2]/2;
            float rg=out1[i*len]+out1[i*len+2]/2;
            float tp=out1[i*len+1]-out1[i*len+3]/2;
            float bt=out1[i*len+1]+out1[i*len+3]/2;
            // float scale=float(detect_size)/w1;
            // float h_scale=float(detect_size)/h1;
            tem_result.left=int(lf/scale);
            tem_result.right=int(rg/scale);
            tem_result.top=int(tp/scale);
            tem_result.bottom=int(bt/scale);
 
            printf("tem_result.clsId:%dn", tem_result.clsId);
            printf("tem_result.left:%dn", tem_result.left);
            printf("tem_result.right:%dn", tem_result.right);
            printf("tem_result.top:%dn", tem_result.top);
            printf("tem_result.bottom:%dn", tem_result.bottom);

            result.push_back(tem_result);
        }
        
    }

    cout<<"before size "<destroy();
        mContext = nullptr;
    }
    if(mEngine !=nullptr) {
        mEngine->destroy();
        mEngine = nullptr;
    }
    for(size_t i=0;i& result, cudaStream_t mystream){

// }
// Build mEngine from `network`. FP16 is enabled only where the platform has
// fast native fp16; otherwise the engine is built in FP32.
// (The previous version returned early on non-FP16 platforms, leaking the
// builder config and leaving mEngine null.)
void Trt::myBuildEngine(nvinfer1::IBuilder* builder,
                      nvinfer1::INetworkDefinition* network) {
    nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
    mBatchSize = 1;
    if (builder->platformHasFastFp16()) {
        config->setFlag(nvinfer1::BuilderFlag::kFP16);
    }
    builder->setMaxBatchSize(mBatchSize);
    // maximum GPU scratch memory the engine may use at execution time
    // NOTE(review): 1 << 22 is only 4 MiB, unusually small -- confirm intent
    config->setMaxWorkspaceSize(1 << 22);
    mEngine = builder->buildEngineWithConfig(*network, *config);
    assert(mEngine != nullptr);
    config->destroy();
}
bool Trt::DeserializeEngine(const std::string& engineFile) {

    std::ifstream in(engineFile.c_str(), std::ifstream::binary);
    cout<<"enginefile"<::max());
        size_t bufCount = in.gcount();
        in.seekg(start_pos);
        std::unique_ptr engineBuf(new char[bufCount]);
        in.read(engineBuf.get(), bufCount);
        initLibNvInferPlugins(&mLogger, "");
        mRuntime = nvinfer1::createInferRuntime(mLogger);
        mEngine = mRuntime->deserializeCudaEngine((void*)engineBuf.get(), bufCount, nullptr);
        assert(mEngine != nullptr);
        mBatchSize = mEngine->getMaxBatchSize();
       // spdlog::info("max batch size of deserialized engine: {}",mEngine->getMaxBatchSize());
        mRuntime->destroy();
        return true;
    }
    return false;
}
bool Trt::BuildEngineWithonnx(const std::string& onnxModel,const std::string& engineFile){

    nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(mLogger);
    assert(builder != nullptr);
    // NetworkDefinitionCreationFlag::kEXPLICIT_BATCH 
    const auto explicitBatch = 1U << static_cast(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    nvinfer1::INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
    assert(network != nullptr);
    nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, mLogger);
    if(!parser->parseFromFile(onnxModel.c_str(), static_cast(ILogger::Severity::kWARNING))) {
        return false;
    }
    for(int i=0;igetNbLayers();i++) {
        nvinfer1::ILayer* custom_output = network->getLayer(i);
        for(int j=0;jgetNbInputs();j++) {
            nvinfer1::ITensor* input_tensor = custom_output->getInput(j);
            //std::cout << input_tensor->getName() << " ";
        }
       // std::cout << " -------> ";
        for(int j=0;jgetNbOutputs();j++) {
            nvinfer1::ITensor* output_tensor = custom_output->getOutput(j);
            //std::cout << output_tensor->getName() << " ";
        }
       // std::cout << std::endl;
    }  
    myBuildEngine(builder, network);
    SaveEngine(engineFile);

    builder->destroy();
    network->destroy();
    parser->destroy();
    return true;
}
void Trt::InitEngine() {
    //::info("init engine...");
    mContext = mEngine->createExecutionContext();
    assert(mContext != nullptr);

   // spdlog::info("malloc device memory");
    int nbBindings = mEngine->getNbBindings();
    std::cout << "nbBingdings: " << nbBindings << std::endl;
    mBinding.resize(nbBindings);
    mBindingSize.resize(nbBindings);
    mBindingName.resize(nbBindings);
    mBindingDims.resize(nbBindings);
    mBindingDataType.resize(nbBindings);
    for(int i=0; i< nbBindings; i++) {
        nvinfer1::Dims dims = mEngine->getBindingDimensions(i);
        nvinfer1::DataType dtype = mEngine->getBindingDataType(i);
        const char* name = mEngine->getBindingName(i);
        int64_t totalSize = volume(dims) * mBatchSize * getElementSize(dtype);
//        int64_t totalSize = volume(dims) * getElementSize(dtype);
        mBindingSize[i] = totalSize;
        mBindingName[i] = name;
        mBindingDims[i] = dims;
        mBindingDataType[i] = dtype;
        if(mEngine->bindingIsInput(i)) {
         //   spdlog::info("input: ");
        } else {
           // spdlog::info("output: ");
        }
      //  spdlog::info("binding bindIndex: {}, name: {}, size in byte: {}",i,name,totalSize);
       // spdlog::info("binding dims with {} dimemsion",dims.nbDims);
        for(int j=0;jbindingIsInput(i)) {
            mInputSize++;
        }
    }

    mContext->enqueue(1, &mBinding[0], nullptr, nullptr);
//    void *p;
//    cudaHostAlloc(&p,mBindingSize[1],0);
//    cudaMemcpyAsync(&p,mBinding[1],mBindingSize[1],cudaMemcpyDeviceToHost);
//    vector tem_result;
//    decode_gpu(mBinding[1],yolo5_3,tem_result);
}

void Trt::Forward() {
    cudaEvent_t start,stop;
    float elapsedTime;
    mBatchSize=1;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    mContext->execute(mBatchSize, &mBinding[0]);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
   // spdlog::info("net forward takes {} ms", elapsedTime);
}

// Enqueue inference on `stream` and return immediately; the caller must
// synchronize the stream before reading any output binding.
void Trt::ForwardAsync(cudaStream_t& stream) {
    mContext->enqueue(mBatchSize, &mBinding[0], stream, nullptr);
}

void Trt::DataTransfer(std::vector& data, int bindIndex, bool isHostToDevice) {
    if(isHostToDevice) {
        assert(data.size()*sizeof(float) <= mBindingSize[bindIndex]);
        CUDA_CHECK(cudaMemcpy(mBinding[bindIndex], data.data(), data.size() * sizeof(float), cudaMemcpyHostToDevice));
    } else {
        data.resize(mBindingSize[bindIndex]/sizeof(float));
        CUDA_CHECK(cudaMemcpy(data.data(), mBinding[bindIndex], mBindingSize[bindIndex], cudaMemcpyDeviceToHost));
    }
}

void Trt::DataTransferAsync(std::vector& data, int bindIndex, bool isHostToDevice, cudaStream_t& stream) {
    if(isHostToDevice) {
        assert(data.size()*sizeof(float) <= mBindingSize[bindIndex]);
        CUDA_CHECK(cudaMemcpyAsync(mBinding[bindIndex], data.data(), data.size() * sizeof(float), cudaMemcpyHostToDevice, stream));
    } else {
        data.resize(mBindingSize[bindIndex]/sizeof(float));
        CUDA_CHECK(cudaMemcpyAsync(data.data(), mBinding[bindIndex], mBindingSize[bindIndex], cudaMemcpyDeviceToHost, stream));
    }
}

void Trt::CopyFromHostToDevice(const std::vector& input, int bindIndex) {
    CUDA_CHECK(cudaMemcpy(mBinding[bindIndex], input.data(), mBindingSize[bindIndex], cudaMemcpyHostToDevice));
}
void Trt::CopyFromHostToDevice(const std::vector& input, int bindIndex,  cudaStream_t& stream) {
    CUDA_CHECK(cudaMemcpyAsync(mBinding[bindIndex], input.data(), mBindingSize[bindIndex], cudaMemcpyHostToDevice, stream));
}
void Trt::CopyFromDeviceToHost(std::vector& output, int bindIndex) {
    CUDA_CHECK(cudaMemcpy(output.data(), mBinding[bindIndex], mBindingSize[bindIndex], cudaMemcpyDeviceToHost));
}
void Trt::CopyFromDeviceToHost(std::vector& output, int bindIndex,  cudaStream_t& stream) {
    CUDA_CHECK(cudaMemcpyAsync(output.data(), mBinding[bindIndex], mBindingSize[bindIndex], cudaMemcpyDeviceToHost, stream));
}
// Select the CUDA device for subsequent allocations and inference.
// NOTE(review): a saved engine is only valid on the device it was built for.
void Trt::SetDevice(int device) {
   // spdlog::warn("warning: make sure save engine file match choosed device");
    CUDA_CHECK(cudaSetDevice(device));
}
// Return the currently active CUDA device ordinal, or -1 on failure.
// (The previous version passed a null pointer to cudaGetDevice, which can
// never succeed; a stack variable is the correct, leak-free fix.)
int Trt::GetDevice() const {
    int device = -1;
    CUDA_CHECK(cudaGetDevice(&device));
    return device;
}
// Maximum batch size of the loaded/built engine.
int Trt::GetMaxBatchSize() const{
    return mBatchSize;
}
// Raw device pointer of binding buffer `bindIndex`.
void* Trt::GetBindingPtr(int bindIndex) const {
    return mBinding[bindIndex];
}
// Size in bytes of binding buffer `bindIndex`.
size_t Trt::GetBindingSize(int bindIndex) const {
    return mBindingSize[bindIndex];
}
// Dimensions reported by the engine for binding `bindIndex`.
nvinfer1::Dims Trt::GetBindingDims(int bindIndex) const {
    return mBindingDims[bindIndex];
}
// Element data type of binding `bindIndex`.
nvinfer1::DataType Trt::GetBindingDataType(int bindIndex) const {
    return mBindingDataType[bindIndex];
}
2.TensorRT加速再优化版本

参考文献:

https://github.com/zerollzeng/tiny-tensorrt

https://github.com/cumtchw/cuda_utils

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/352694.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号