基于tensorflow 2.x 搭建Faster RCNN网络训练自己的数据

前言：暑期实习做了一些目标检测方面的工作，按照目标检测的发展史也逐步进行了学习，但是总感觉看论文只是纸上谈兵，训练模型也只是调用模块中的函数、fit别人准备好的数据，这个过程中没有自己的东西。恰好看了Faster RCNN的论文，Faster RCNN又是一个使用anchor、RPN网络、端到端训练的经典算法；又想学习一下tensorflow，更恰巧手边有一些实际项目的数据，所以历时一个月左右，利用零碎的时间和有限的资源跑通了Faster RCNN网络，取得了理想的效果。

当然也不是完全自己复现，搭建网络过程中参考了这位老哥的文章，对其中的实现细节进行了更深入的了解，并对训练过程中出现的问题以及解决方法进行了进一步记录。

1、utils.py实用程序函数说明

导入需要的包。wandhG数组存放9个anchor先验框的宽高尺寸（每行依次为宽、高），这些先验anchor框是基于训练数据集中的gt框进行聚类生成的。输入图片的尺寸为512*512，可自行调整，想计算速度快一点的就设置小一点的图像尺寸。

import numpy as np
import cv2
from xml.dom.minidom import parse
import tensorflow as tf
# Anchor priors, one row per anchor, stored as (width, height) in pixels of the
# network input. According to the accompanying article they were obtained by
# clustering the ground-truth boxes of the training set.
wandhG = np.array([[45.5, 48.47058824],
                   [48.5, 105.17647059],
                   [91.5, 76.23529412],
                   [60., 103.52941177],
                   [112.25, 48.],
                   [75., 96.],
                   [24., 26.82352941],
                   [107., 61.17647059],
                   [87., 26.35294118]], dtype=np.float32)

# Size (in pixels) of the image fed to the network.
image_height = 512
image_width = 512

load_gt_boxes函数：将图片的标注文件进行解析，可解析labelimg标注的xml文件以及YOLO格式的txt文件，最终返回一张图像上多个gt框的左上角和右下角坐标（当前实现只返回坐标，并不返回label）。

def load_gt_boxes(path, width=None, height=None):
    """Load the ground-truth boxes of one image from its annotation file.

    Two annotation formats are handled:

    * ``.xml`` files: every ``<object>/<bndbox>`` element is read and its
      ``xmin, ymin, xmax, ymax`` values are returned as integers exactly as
      stored in the file (no rescaling is applied).
    * any other file: treated as a text label file whose non-empty lines read
      ``label x_center y_center w h`` separated by whitespace. The four box
      values are multiplied by the image size, i.e. they are assumed to be
      normalised to the image (YOLO convention).

    The class label (first column / ``<name>``) is not returned.

    :param path: path of the annotation file.
    :param width: image width in pixels used to scale text labels; defaults
        to the module-level ``image_width``.
    :param height: image height in pixels used to scale text labels; defaults
        to the module-level ``image_height``.
    :return: list of ``[xmin, ymin, xmax, ymax]`` boxes.
    """
    if str(path).lower().endswith(".xml"):
        return _load_gt_boxes_xml(path)

    if width is None:
        width = image_width
    if height is None:
        height = image_height

    boxes = []
    with open(path, "r") as f:
        for line in f:
            data = line.split()
            if not data:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            x_center = np.float64(data[1]) * width
            y_center = np.float64(data[2]) * height
            half_w = np.float64(data[3]) * width / 2
            half_h = np.float64(data[4]) * height / 2
            boxes.append([x_center - half_w, y_center - half_h,
                          x_center + half_w, y_center + half_h])
    return boxes


def _load_gt_boxes_xml(path):
    """Read ``[xmin, ymin, xmax, ymax]`` of every ``<object>`` in an XML annotation."""
    root_node = parse(path).documentElement
    boxes = []
    for obj in root_node.getElementsByTagName("object"):
        bndbox = obj.getElementsByTagName("bndbox")[0]
        boxes.append([
            int(bndbox.getElementsByTagName(tag)[0].childNodes[0].data)
            for tag in ("xmin", "ymin", "xmax", "ymax")
        ])
    return boxes

plot_boxes_on_image函数：将boxes坐标绘制在图片上，并返回RGB格式的图像（可用于测试坐标数据解析是否正确）。

def plot_boxes_on_image(image_with_boxes, boxes, thickness=2, color=(255, 0, 0)):
    """Draw rectangles on an image and return an RGB-converted copy.

    The rectangles are drawn directly onto ``image_with_boxes`` (the array is
    modified in place); the returned array is the result of converting that
    image with ``cv2.COLOR_BGR2RGB``, so the input is treated as BGR.

    :param image_with_boxes: image array to draw on.
    :param boxes: iterable of ``[xmin, ymin, xmax, ymax]``; values are
        truncated to ``int32`` before drawing.
    :param thickness: line thickness passed to ``cv2.rectangle``.
    :param color: colour passed to ``cv2.rectangle``. A tuple is used as the
        default so that no mutable object is shared between calls.
    :return: the drawn image converted from BGR to RGB.
    """
    boxes = np.array(boxes).astype(np.int32)
    for box in boxes:
        # Plain Python ints avoid argument-parsing errors some OpenCV builds
        # raise for NumPy scalar coordinates.
        top_left = (int(box[0]), int(box[1]))
        bottom_right = (int(box[2]), int(box[3]))
        cv2.rectangle(image_with_boxes, pt1=top_left, pt2=bottom_right,
                      color=color, thickness=thickness)
    image_with_boxes = cv2.cvtColor(image_with_boxes, cv2.COLOR_BGR2RGB)
    return image_with_boxes

compute_iou函数：计算两个坐标框的交并比（IoU）。IoU用于衡量预测框和gt框的重合和接近程度，IoU越接近1，预测框和gt框越接近。

def compute_iou(box1, box2):
    """Compute the IoU (Intersection over Union) of axis-aligned boxes.

    Boxes are ``[xmin, ymin, xmax, ymax]``. Either argument may also be an
    array of boxes with the coordinates on the last axis; the two inputs are
    then broadcast against each other, so one box can be compared with many.

    :param box1: a box, or an array of boxes of shape ``(..., 4)``.
    :param box2: a box, or an array of boxes of shape ``(..., 4)``.
    :return: IoU as a ``float`` when both inputs are single boxes, otherwise
        an array of IoU values. Boxes that do not overlap (or only touch)
        give 0.
    """
    box1 = np.asarray(box1, dtype=np.float64)
    box2 = np.asarray(box2, dtype=np.float64)

    w_1 = box1[..., 2] - box1[..., 0]
    h_1 = box1[..., 3] - box1[..., 1]
    w_2 = box2[..., 2] - box2[..., 0]
    h_2 = box2[..., 3] - box2[..., 1]

    # Extent of the overlap along each axis; negative when the boxes are apart.
    w_in = np.minimum(box1[..., 2], box2[..., 2]) - np.maximum(box1[..., 0], box2[..., 0])
    h_in = np.minimum(box1[..., 3], box2[..., 3]) - np.maximum(box1[..., 1], box2[..., 1])

    area_in = np.clip(w_in, 0, None) * np.clip(h_in, 0, None)
    area_un = w_1 * h_1 + w_2 * h_2 - area_in

    # A zero union (two degenerate boxes) is reported as IoU 0 instead of NaN.
    with np.errstate(divide="ignore", invalid="ignore"):
        iou = np.where(area_un > 0, area_in / area_un, 0.0)

    return float(iou) if iou.ndim == 0 else iou

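regression_box_shift函数：对检测到目标的proposal框（即得分大于positive_threshold、与gt框的交并比大于iou阈值的框），计算其向ground_truth框的变换量，tx、ty为坐标平移量，tw、th为宽度和高度的缩放量，即 $t_x = (x^{g}_{min} - x^{p}_{min}) / w_p$，$t_y = (y^{g}_{min} - y^{p}_{min}) / h_p$，$t_w = \log(w_g / w_p)$，$t_h = \log(h_g / h_p)$。一定要注意变换的顺序，要不然训练和测试的时候会发现候选框离目标框越来越远，得分越来越低，loss越来越爆炸。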

def regression_box_shift(p, g):
    """Compute the transform ``t`` that maps proposal box ``p`` onto box ``g``.

    Both boxes are ``[xmin, ymin, xmax, ymax]``. The shift is measured between
    the top-left corners and normalised by the proposal size; the scale is the
    log ratio of the sizes.

    :param p: proposal (anchor) box.
    :param g: ground-truth box.
    :return: ``[tx, ty, tw, th]``.
    """
    w_p = p[2] - p[0]
    h_p = p[3] - p[1]
    w_g = g[2] - g[0]
    h_g = g[3] - g[1]

    tx = (g[0] - p[0]) / w_p
    ty = (g[1] - p[1]) / h_p
    tw = np.log(w_g / w_p)
    th = np.log(h_g / h_p)
    return [tx, ty, tw, th]

output_decode函数：对预测的boxes和得分进行解码。根据Faster RCNN的网络结构，图像经过backbone网络进行了4次Maxpool，最后得到的feature map大小为输入图像尺寸的十六分之一，也就是512/16 = 32。feature map中的每一个像素对应原输入图像上的一个16*16大小的grid。此函数先计算原输入图像上每个grid的中心坐标，以及以此坐标为中心的9个anchor框的坐标；再将anchor先验框按预测得到的变换量进行变换，得到所有anchor的预测框；最后经过预测框得分的阈值筛选，得到最终的预测框和对应得分。

def output_decode(pred_bboxes, pred_scores, score_thresh=0.5):
    """Decode predicted box transforms into image-space boxes and filter by score.

    A 32 x 32 grid of cells with a stride of 16 pixels is built; each cell
    centre carries the anchors listed in ``wandhG``. The predicted
    ``(tx, ty, tw, th)`` of every anchor is applied to the anchor's top-left
    corner and size, and only boxes whose score in channel 1 of
    ``pred_scores`` exceeds ``score_thresh`` are kept.

    :param pred_bboxes: predicted transforms; the last axis holds
        ``(tx, ty, tw, th)``. Assumed to be shaped ``(batch, 32, 32, 9, 4)``
        as built by the demo script -- confirm against the network output.
    :param pred_scores: predicted scores; channel 1 of the last axis is used.
    :param score_thresh: minimum (exclusive) score for a box to be kept.
    :return: ``(boxes, scores)`` as NumPy arrays of shape ``(N, 4)`` and
        ``(N,)``, boxes being ``[xmin, ymin, xmax, ymax]``.
    """
    grid_size = 32  # feature-map cells per side
    stride = 16     # input pixels covered by one feature-map cell

    grid_x, grid_y = tf.range(grid_size, dtype=tf.int32), tf.range(grid_size, dtype=tf.int32)
    grid_x, grid_y = tf.meshgrid(grid_x, grid_y)
    grid_x, grid_y = tf.expand_dims(grid_x, -1), tf.expand_dims(grid_y, -1)
    grid_xy = tf.stack([grid_x, grid_y], axis=-1)

    # Centre of every cell in input-image pixels.
    center_xy = grid_xy * stride + stride // 2
    center_xy = tf.cast(center_xy, tf.float32)

    # Top-left corner of every anchor; broadcasting against wandhG adds the
    # anchor axis, then a leading batch axis is inserted.
    anchor_xymin = center_xy - 0.5 * wandhG
    anchor_xymin = tf.expand_dims(anchor_xymin, axis=0)

    # Inverse of regression_box_shift: the shift is scaled by the anchor size
    # and added to the anchor corner; the size is exp(t) times the anchor size.
    xy_min = pred_bboxes[..., 0:2] * wandhG[:, 0:2] + anchor_xymin
    xy_max = tf.exp(pred_bboxes[..., 2:4]) * wandhG[:, 0:2] + xy_min
    pred_bboxes = tf.concat([xy_min, xy_max], axis=-1)

    pred_scores = pred_scores[..., 1]
    score_mask = pred_scores > score_thresh
    pred_bboxes = tf.reshape(tf.boolean_mask(pred_bboxes, score_mask), shape=[-1, 4]).numpy()
    pred_scores = tf.reshape(tf.boolean_mask(pred_scores, score_mask), shape=[-1, ]).numpy()
    return pred_bboxes, pred_scores

nms函数：非极大抑制（Non-Maximum Suppression）过程，目的是筛选出每张图像每个目标的预测框中得分最高的框，并滤除与之重合的框。

def nms(pred_boxes, pred_score, iou_threshold):
    """Non-Maximum Suppression.

    Repeatedly keeps the highest-scoring remaining box and discards every
    other box whose IoU with it is greater than ``iou_threshold``.

    The inputs are not modified; lists and NumPy arrays are both accepted.

    :param pred_boxes: boxes as ``[xmin, ymin, xmax, ymax]``, shape ``(N, 4)``.
    :param pred_score: one score per box, shape ``(N,)``.
    :param iou_threshold: boxes overlapping a kept box by more than this IoU
        are suppressed.
    :return: NumPy array of the kept boxes, highest score first.
    """
    pred_boxes = np.asarray(pred_boxes).reshape(-1, 4)
    pred_score = np.asarray(pred_score).reshape(-1)

    nms_boxes = []
    while len(pred_boxes) > 0:
        max_id = np.argmax(pred_score)
        selected_box = pred_boxes[max_id]
        nms_boxes.append(selected_box)

        # np.delete returns new arrays; `del arr[i]` is not supported by NumPy.
        pred_boxes = np.delete(pred_boxes, max_id, axis=0)
        pred_score = np.delete(pred_score, max_id)

        # Evaluate box by box so this works whether or not compute_iou
        # supports arrays of boxes.
        ious = np.array([compute_iou(selected_box, box) for box in pred_boxes])
        iou_mask = ious <= iou_threshold
        pred_boxes = pred_boxes[iou_mask]
        pred_score = pred_score[iou_mask]

    nms_boxes = np.array(nms_boxes)
    return nms_boxes

2、demo.py测试上述函数

这其中for循环为代码主要部分，其对每个anchor框进行遍历。步骤是：先计算每个anchor框的坐标，检验其是否超出边界；接着计算anchor框与此张图像中所有gt框的交并比，根据正反例iou阈值判断是否检测到目标，并相应更新target_boxes、target_scores、target_mask三个tensor（numpy数组），其中target_boxes只有在检测到目标时才进行更新，并选取与之交并比最大的gt框计算坐标偏移量。最终的效果和直接将标注框绘制在图像上无异，即说明前面的代码是正常运行的。

这里有一个坑：如果你的图像数据中存在待检测目标位于图像边缘区域（即待检测目标只有很小一部分位于图像内），这时就容易出现anchor框超出边界的情况，而超出图像边界一定范围的anchor框都会被过滤掉，进一步造成训练过程中出现nan的情况。

import matplotlib.pyplot as plt
import cv2
from utils import load_gt_boxes, compute_iou, regression_box_shift, nms, output_decode, wandhG, plot_boxes_on_image
import numpy as np
# Demo script: encode the ground-truth boxes of one sample into per-anchor
# targets, visualise the matched anchors, then decode the targets back into
# boxes to check that encoding and decoding agree.

# IoU at or above which an anchor is labelled positive (object detected)
pos_thresh = 0.5
# IoU at or below which an anchor is labelled negative (no object)
neg_thresh = 0.1
iou_thresh = 0.5
image_height = 512
image_width = 512
grid_height = 16
grid_width = 16

# Test sample
label_path = "2821.txt"
img_path = "2821.png"
gt_boxes = load_gt_boxes(label_path)
raw_img = cv2.imread(img_path)
if raw_img is None:
    # cv2.imread signals failure by returning None instead of raising.
    raise FileNotFoundError(img_path)
img_boxes = np.copy(raw_img)
print(gt_boxes)
# Boxes live in the 512 x 512 working space and are doubled before being drawn
# on the raw image (presumably the raw image is 1024 x 1024 -- confirm).
img_with_boxes = plot_boxes_on_image(img_boxes, np.array(gt_boxes) * 2)
plt.figure()
plt.imshow(img_with_boxes)
plt.show()

# Targets for every anchor: box transform, score and mask.
# Shape: the 32 x 32 feature map, each pixel mapping to a 16 x 16 grid cell of
# the input; every cell has 9 anchors; every anchor has 4 box values.
# Scores hold [no-object, object] for each anchor.
# Mask is 1 for a positive anchor, -1 for a negative anchor and 0 otherwise.
target_boxes = np.zeros(shape=[32, 32, 9, 4])
target_scores = np.zeros(shape=[32, 32, 9, 2])
target_mask = np.zeros(shape=[32, 32, 9])

# Split the feature map into 32 * 32 cells and visit every anchor.
encoding_img = np.copy(raw_img)
encoding_img = cv2.resize(encoding_img, dsize=(512, 512), interpolation=cv2.INTER_CUBIC)
for i in range(32):
    for j in range(32):
        for k in range(9):
            center_y = i * grid_height + grid_height * 0.5
            center_x = j * grid_width + grid_width * 0.5
            # Anchor corners from its centre and its (width, height) prior.
            xmin = center_x - wandhG[k][0] * 0.5
            xmax = center_x + wandhG[k][0] * 0.5
            ymin = center_y - wandhG[k][1] * 0.5
            ymax = center_y + wandhG[k][1] * 0.5
            # Skip anchors that cross the image boundary by 5 pixels or more.
            if (xmin > -5) and (ymin > -5) and (xmax < (image_width + 5)) and (ymax < (image_height + 5)):
                anchor_boxes = np.array([xmin, ymin, xmax, ymax])
                anchor_boxes = np.expand_dims(anchor_boxes, axis=0)
                print(anchor_boxes)
                # IoU between this anchor and every ground-truth box.
                ious = []
                for gt_box in gt_boxes:
                    iou = compute_iou(anchor_boxes[0], gt_box)
                    ious.append(iou)
                ious = np.array(ious)
                positive_masks = ious >= pos_thresh
                negative_masks = ious <= neg_thresh
                # Positive: overlaps at least one ground-truth box well enough.
                if np.any(positive_masks):
                    plot_boxes_on_image(encoding_img, anchor_boxes, thickness=1)
                    cv2.circle(encoding_img, center=(int(0.5 * (xmin + xmax)), int(0.5 * (ymin + ymax))), radius=1,
                               color=[255, 0, 0], thickness=1)
                    # Mark the anchor as containing an object.
                    target_scores[i, j, k, 1] = 1
                    target_mask[i, j, k] = 1
                    # Regress towards the ground-truth box that matches best.
                    max_iou_id = np.argmax(ious)
                    selected_gt_boxes = gt_boxes[max_iou_id]
                    target_boxes[i, j, k] = regression_box_shift(anchor_boxes[0], selected_gt_boxes)
                # Negative: overlaps no ground-truth box beyond neg_thresh.
                if np.all(negative_masks):
                    # NOTE(review): the source wrote 0 here, a no-op on the
                    # zero-initialised array; 1 makes the target one-hot for
                    # the no-object class -- confirm against the loss.
                    target_scores[i, j, k, 0] = 1
                    target_mask[i, j, k] = -1
                    cv2.circle(encoding_img, center=(int(0.5 * (xmin + xmax)), int(0.5 * (ymin + ymax))), radius=1,
                               color=[0, 0, 0], thickness=1)
cv2.namedWindow("encoded image", cv2.WINDOW_NORMAL)
cv2.imshow("encoded image", encoding_img)
cv2.waitKey(0)

# Decode the targets as if they were network predictions.
faster_decode_img = np.copy(raw_img)
pred_boxes = np.expand_dims(target_boxes, 0).astype(np.float32)
pred_scores = np.expand_dims(target_scores, 0).astype(np.float32)
pred_boxes, pred_scores = output_decode(pred_boxes, pred_scores, 0.9)
nms_pred_boxes = nms(pred_boxes, pred_scores, 0.1)
# Draw the boxes that survived NMS (the source computed them but then drew
# the unfiltered boxes).
img_with_predbox = plot_boxes_on_image(faster_decode_img, nms_pred_boxes * 2, color=[255, 0, 0], thickness=1)
# plot_boxes_on_image returns RGB, while cv2.imshow / cv2.imwrite expect BGR.
img_with_predbox = cv2.cvtColor(img_with_predbox, cv2.COLOR_RGB2BGR)
cv2.namedWindow("pred_img", cv2.WINDOW_NORMAL)
cv2.imshow("pred_img", img_with_predbox)
cv2.waitKey(0)
cv2.imwrite("img_demo.png", img_with_predbox)

3、rpn.py搭建Faster RCNN网络

继承keras中的Model并重写call方法，进行Faster RCNN网络的搭建。其中在RPN网络层中，参考文章中的kernel_size为[5, 2]，暂时没弄清楚为什么要这样设置，难道是为了使得RPN网络产生的预测框更倾向于细长形的？由于自己数据集的关系，将kernel_size设置成了[3, 3]。最终网络返回对应的预测框坐标以及得分。

import tensorflow as tf
from tensorflow.keras.layers import Conv2D, Dropout, BatchNormalization, MaxPool2D, Flatten, Dense, InputLayer
# Print the installed TensorFlow version (the article targets TensorFlow 2.x).
print(tf.__version__)

基于tensorflow 2.x 搭建Faster RCNN网络训练自己的数据

Python相关栏目本月热门文章