yolov5 解码使用GPU进行加速

YOLOv5原理方面这里不再过多阐述，直接从输出头开始，然后设计如编解码：

1.yolov5系列的原始输出是3个head头，上图画的是输入为608*608的分辨率的图，如果输入改为640*640分辨率的图片，那么输出的3个头分别对应三个8、16、32下采样的输出分别为80*80*255、40*40*255、20*20*255，其中对应的数字意义如上图所示。

2.那么 80*80*255、40*40*255、20*20*255数字分别代表什么意思，其中B是batch

3 上图输出的3个head，并不是最终的输出，还需要做很多的工作，如果直接这样输出，后续代码解码很麻烦，因此需要进一步的处理这三个头，以此方便后面的代码进行解码操作，具体做以下工作：

3.1 需要做sigmoid激活函数

3.2 xy*2-0.5

3.3 (wh*2)**2*anchor

3.4 拿到640尺度下的框

从中可以看到需要很多种操作，很麻烦，可以让onnx来做，因此为了更好的在连续空间可以访问到，可以通过变换以下输出的通道，即原来的B*3*85*80*80,可以变换为B*3*80*80*85, 得到这样的tensor，可以很容易的进行操作，但是因为存在三个头，还是很麻烦，那么还可以继续合并，即B*19200*85，那么其他的三个头类似：

此时需要修改yolo导出的python代码使其支持onnx的导出：

其中修改python的代码在E:projectc++yolov5-mastermodelsyolo.py

def forward(self, x):
        z = []  # inference output
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            # bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            bs, _, ny, nx = map(int, x[i].shape)  # x(bs,255,20,20) to x(bs,3,20,20,85)
            # x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            x[i] = x[i].view(-1, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()


            if not self.training:  # inference
                if self.grid[i].shape[2:4] != x[i].shape[2:4] or self.onnx_dynamic:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                y = x[i].sigmoid()
                # if self.inplace:
                if self.inplace:
                    y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
                    xy = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i].view(1, self.na, 1, 1, 2)  # wh
                    y = torch.cat((xy, wh, y[..., 4:]), -1)
                z.append(y.view(bs, -1, self.no))

        return x if self.training else (torch.cat(z, 1), x)

导出完成后的onnx应该为如下：

以上就是得到onnx前的工作，解码完成后应该使用tensorrt进行推理，整个代码在我的github中，

下面主要把关键的代码贴出来：

1.预处理核函数

    static __global__ void warp_affine_bilinear_and_normalize_plane_kernel(uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height, 
        uint8_t const_value_st, float* warp_affine_matrix_2_3, Norm norm, int edge){

        

        
        int position = blockDim.x * blockIdx.x + threadIdx.x;
        if (position >= edge) return;
        
        float m_x1 = warp_affine_matrix_2_3[0];
        float m_y1 = warp_affine_matrix_2_3[1];
        float m_z1 = warp_affine_matrix_2_3[2];
        float m_x2 = warp_affine_matrix_2_3[3];
        float m_y2 = warp_affine_matrix_2_3[4];
        float m_z2 = warp_affine_matrix_2_3[5];
        
        int dx      = position % dst_width;
        int dy      = position / dst_width;
        
        float src_x = m_x1 * dx + m_y1 * dy + m_z1;
        float src_y = m_x2 * dx + m_y2 * dy + m_z2;
        float c0, c1, c2;

        
        if(src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height){
            // out of range
            c0 = const_value_st;
            c1 = const_value_st;
            c2 = const_value_st;
        }else{
            
            int y_low = floorf(src_y);
            int x_low = floorf(src_x);
            int y_high = y_low + 1;
            int x_high = x_low + 1;
            
            uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
            float ly    = src_y - y_low;
            float lx    = src_x - x_low;
            float hy    = 1 - ly;
            float hx    = 1 - lx;
            float w1    = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
            uint8_t* v1 = const_value;
            uint8_t* v2 = const_value;
            uint8_t* v3 = const_value;
            uint8_t* v4 = const_value;
            if(y_low >= 0){
                if (x_low >= 0)
                    v1 = src + y_low * src_line_size + x_low * 3;

                if (x_high < src_width)
                    v2 = src + y_low * src_line_size + x_high * 3;
            }
            
            if(y_high < src_height){
                if (x_low >= 0)
                    v3 = src + y_high * src_line_size + x_low * 3;

                if (x_high < src_width)
                    v4 = src + y_high * src_line_size + x_high * 3;
            }
            
            c0 = floorf(w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0] + 0.5f);
            c1 = floorf(w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1] + 0.5f);
            c2 = floorf(w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2] + 0.5f);
        }

        if(norm.channel_type == ChannelType::SwapRB){
            float t = c2;
            c2 = c0;  c0 = t;
        }

        if(norm.type == NormType::MeanStd){
            c0 = (c0 * norm.alpha - norm.mean[0]) / norm.std[0];
            c1 = (c1 * norm.alpha - norm.mean[1]) / norm.std[1];
            c2 = (c2 * norm.alpha - norm.mean[2]) / norm.std[2];
        }else if(norm.type == NormType::AlphaBeta){
            c0 = c0 * norm.alpha + norm.beta;
            c1 = c1 * norm.alpha + norm.beta;
            c2 = c2 * norm.alpha + norm.beta;
        }
        
        int area = dst_width * dst_height;
        float* pdst_c0 = dst + dy * dst_width + dx;
        float* pdst_c1 = pdst_c0 + area;
        float* pdst_c2 = pdst_c1 + area;
        *pdst_c0 = c0;
        *pdst_c1 = c1;
        *pdst_c2 = c2;
    }

    static void warp_affine_bilinear_and_normalize_plane(
        uint8_t* src, int src_line_size, int src_width, int src_height, float* dst, int dst_width, int dst_height,
        float* matrix_2_3, uint8_t const_value, const Norm& norm,
        cudaStream_t stream) {
        
        
        int jobs   = dst_width * dst_height;
        auto grid  = grid_dims(jobs);
        auto block = block_dims(jobs);
        
        checkCudaKernel(warp_affine_bilinear_and_normalize_plane_kernel << > > (
            src, src_line_size,
            src_width, src_height, dst,
            dst_width, dst_height, const_value, matrix_2_3, norm, jobs
        ));
    }

2.解码核函数

const int NUM_BOX_ELEMENT = 7;      // left, top, right, bottom, confidence, class, keepflag
    static __device__ void affine_project(float* matrix, float x, float y, float* ox, float* oy){
        *ox = matrix[0] * x + matrix[1] * y + matrix[2];
        *oy = matrix[3] * x + matrix[4] * y + matrix[5];
    }
    
    
    static __global__ void decode_kernel(float* predict, int num_bboxes, int num_classes, float confidence_threshold, float* invert_affine_matrix, float* parray, int max_objects){  
   
        
  
        
        int position = blockDim.x * blockIdx.x + threadIdx.x;
        
        if (position >= num_bboxes) return;
        
        float* pitem     = predict + (5 + num_classes) * position;
        
        float objectness = pitem[4];
        
        if(objectness < confidence_threshold)
            return;
        
        float* class_confidence = pitem + 5;
        float confidence        = *class_confidence++;
        int label               = 0;
        
        for(int i = 1; i < num_classes; ++i, ++class_confidence){
            if(*class_confidence > confidence){
                confidence = *class_confidence;
                label      = i;
            }
        }
        
        confidence *= objectness;
        
        if(confidence < confidence_threshold)
            return;
        
        int index = atomicAdd(parray, 1);
        if(index >= max_objects)
            return;
        
        float cx         = *pitem++;
        float cy         = *pitem++;
        float width      = *pitem++;
        float height     = *pitem++;
        
        float left   = cx - width * 0.5f;
        float top    = cy - height * 0.5f;
        float right  = cx + width * 0.5f;
        float bottom = cy + height * 0.5f;
        
        affine_project(invert_affine_matrix, left,  top,    &left,  &top);
        affine_project(invert_affine_matrix, right, bottom, &right, &bottom);

        
        float* pout_item = parray + 1 + index * NUM_BOX_ELEMENT;
        *pout_item++ = left;
        *pout_item++ = top;
        *pout_item++ = right;
        *pout_item++ = bottom;
        *pout_item++ = confidence;
        *pout_item++ = label;
        *pout_item++ = 1; // 1 = keep, 0 = ignore
    }

    static __device__ float box_iou(
        float aleft, float atop, float aright, float abottom, 
        float bleft, float btop, float bright, float bbottom
    ){

        float cleft 	= max(aleft, bleft);
        float ctop 		= max(atop, btop);
        float cright 	= min(aright, bright);
        float cbottom 	= min(abottom, bbottom);
        
        float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f);
        if(c_area == 0.0f)
            return 0.0f;
        
        float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
        float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
        return c_area / (a_area + b_area - c_area);
    }
    
    static __global__ void fast_nms_kernel(float* bboxes, int max_objects, float threshold){

        
        int position = (blockDim.x * blockIdx.x + threadIdx.x);
        
        int count = min((int)*bboxes, max_objects);
        if (position >= count) 
            return;
        
        
        // left, top, right, bottom, confidence, class, keepflag
        float* pcurrent = bboxes + 1 + position * NUM_BOX_ELEMENT;
        for(int i = 0; i < count; ++i){
            float* pitem = bboxes + 1 + i * NUM_BOX_ELEMENT;
            
            if(i == position || pcurrent[5] != pitem[5]) continue;
            
            if(pitem[4] >= pcurrent[4]){
                
                if(pitem[4] == pcurrent[4] && i < position)
                    continue;
                
                float iou = box_iou(
                    pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3],
                    pitem[0],    pitem[1],    pitem[2],    pitem[3]
                );
                
                if(iou > threshold){
                    pcurrent[6] = 0;  // 1=keep, 0=ignore
                    return;
                }
            }
            
        }
    } 

    static void decode_kernel_invoker(float* predict, int num_bboxes, int num_classes, float confidence_threshold, float nms_threshold, float* invert_affine_matrix, float* parray, int max_objects, cudaStream_t stream){
        
        

        auto grid = grid_dims(num_bboxes);
        auto block = block_dims(num_bboxes);
        
        
        checkCudaKernel(decode_kernel<<>>(predict, num_bboxes, num_classes, confidence_threshold, invert_affine_matrix, parray, max_objects));
        
        grid = grid_dims(max_objects);
        block = block_dims(max_objects);
        checkCudaKernel(fast_nms_kernel<<>>(parray, max_objects, nms_threshold));
    }

yolov5 解码使用GPU进行加速

Python相关栏目本月热门文章