和之前实现的YOLOv1一样,本文依据《YOLO目标检测》(ISBN:9787115627094)一书,在不脱离YOLOv2大部分核心理念的前提下,重构一款较新的YOLOv2检测器,以便对YOLOv2有更加深刻的认识。
书中源码链接:RT-ODLab: YOLO Tutorial
对比原始YOLOv2网络,主要改进点如下:
添加了后续YOLO中使用的neck,即SPPF模块(结构示意见本列表后的代码)
使用普遍用在RetinaNet、FCOS、YOLOX等通用目标检测网络中的解耦检测头(Decoupled head)
修改损失函数:分类分支替换为BCE loss,回归分支替换为GIoU loss。
由基于边界框的正样本匹配策略,改为基于先验框的正样本匹配策略。
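其中SPPF(Spatial Pyramid Pooling - Fast)模块在后面的代码中通过build_neck构建,具体实现未在本文列出。下面给出一个最小示意(非书中源码,类名与写法均为假设,仅说明"1×1卷积降维 + 串联三次同一个5×5最大池化 + 拼接后1×1卷积恢复通道"的结构):
# SPPF模块最小示意(非RT-ODLab源码,仅供理解结构)
import torch
import torch.nn as nn

class SPPF(nn.Module):
    # 串联池化等价于并联不同核大小的SPP,但计算量更小
    def __init__(self, in_dim, out_dim, expand_ratio=0.5, pooling_size=5):
        super().__init__()
        inter_dim = int(in_dim * expand_ratio)
        self.cv1 = nn.Sequential(
            nn.Conv2d(in_dim, inter_dim, 1),
            nn.BatchNorm2d(inter_dim),
            nn.LeakyReLU(0.1, inplace=True))
        self.pool = nn.MaxPool2d(pooling_size, stride=1, padding=pooling_size // 2)
        self.cv2 = nn.Sequential(
            nn.Conv2d(inter_dim * 4, out_dim, 1),
            nn.BatchNorm2d(out_dim),
            nn.LeakyReLU(0.1, inplace=True))
        self.out_dim = out_dim

    def forward(self, x):
        x = self.cv1(x)
        y1 = self.pool(x)
        y2 = self.pool(y1)
        y3 = self.pool(y2)
        # 拼接原始特征与三次池化后的特征,再用1x1卷积压缩回out_dim
        return self.cv2(torch.cat([x, y1, y2, y3], dim=1))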
对比之前实现的YOLOv1网络,主要改进点如下:
主干网络由ResNet18改为DarkNet19
添加先验框机制
正样本匹配策略改为:基于先验框的正样本匹配策略
YOLOv2代码和之前实现的YOLOv1相比,修改之处不多,建议先看之前实现的YOLOv1的相关文章。
# RT-ODLab/models/detectors/yolov2/yolov2_backbone.py
import torch
import torch.nn as nn
model_urls = {
"darknet19": "https://github.com/yjh0410/image_classification_pytorch/releases/download/weight/darknet19.pth",
}
__all__ = ['DarkNet19']
# --------------------- Basic Module -----------------------
class Conv_BN_LeakyReLU(nn.Module):
def __init__(self, in_channels, out_channels, ksize, padding=0, stride=1, dilation=1):
super(Conv_BN_LeakyReLU, self).__init__()
self.convs = nn.Sequential(
nn.Conv2d(in_channels, out_channels, ksize, padding=padding, stride=stride, dilation=dilation),
nn.BatchNorm2d(out_channels),
nn.LeakyReLU(0.1, inplace=True)
)
def forward(self, x):
return self.convs(x)
# --------------------- DarkNet-19 -----------------------
class DarkNet19(nn.Module):
def __init__(self):
super(DarkNet19, self).__init__()
# backbone network : DarkNet-19
# output : stride = 2, c = 32
self.conv_1 = nn.Sequential(
Conv_BN_LeakyReLU(3, 32, 3, 1),
nn.MaxPool2d((2,2), 2),
)
# output : stride = 4, c = 64
self.conv_2 = nn.Sequential(
Conv_BN_LeakyReLU(32, 64, 3, 1),
nn.MaxPool2d((2,2), 2)
)
# output : stride = 8, c = 128
self.conv_3 = nn.Sequential(
Conv_BN_LeakyReLU(64, 128, 3, 1),
Conv_BN_LeakyReLU(128, 64, 1),
Conv_BN_LeakyReLU(64, 128, 3, 1),
nn.MaxPool2d((2,2), 2)
)
# output : stride = 8, c = 256
self.conv_4 = nn.Sequential(
Conv_BN_LeakyReLU(128, 256, 3, 1),
Conv_BN_LeakyReLU(256, 128, 1),
Conv_BN_LeakyReLU(128, 256, 3, 1),
)
# output : stride = 16, c = 512
self.maxpool_4 = nn.MaxPool2d((2, 2), 2)
self.conv_5 = nn.Sequential(
Conv_BN_LeakyReLU(256, 512, 3, 1),
Conv_BN_LeakyReLU(512, 256, 1),
Conv_BN_LeakyReLU(256, 512, 3, 1),
Conv_BN_LeakyReLU(512, 256, 1),
Conv_BN_LeakyReLU(256, 512, 3, 1),
)
# output : stride = 32, c = 1024
self.maxpool_5 = nn.MaxPool2d((2, 2), 2)
self.conv_6 = nn.Sequential(
Conv_BN_LeakyReLU(512, 1024, 3, 1),
Conv_BN_LeakyReLU(1024, 512, 1),
Conv_BN_LeakyReLU(512, 1024, 3, 1),
Conv_BN_LeakyReLU(1024, 512, 1),
Conv_BN_LeakyReLU(512, 1024, 3, 1)
)
def forward(self, x):
c1 = self.conv_1(x) # c1
c2 = self.conv_2(c1) # c2
c3 = self.conv_3(c2) # c3
c3 = self.conv_4(c3) # c3
c4 = self.conv_5(self.maxpool_4(c3)) # c4
c5 = self.conv_6(self.maxpool_5(c4)) # c5
return c5
# --------------------- Functions -----------------------
def build_backbone(model_name='darknet19', pretrained=False):
if model_name == 'darknet19':
# model
model = DarkNet19()
feat_dim = 1024
# load weight
if pretrained:
print('Loading pretrained weight ...')
url = model_urls['darknet19']
# checkpoint state dict
checkpoint_state_dict = torch.hub.load_state_dict_from_url(
url=url, map_location="cpu", check_hash=True)
# model state dict
model_state_dict = model.state_dict()
# check
for k in list(checkpoint_state_dict.keys()):
if k in model_state_dict:
shape_model = tuple(model_state_dict[k].shape)
shape_checkpoint = tuple(checkpoint_state_dict[k].shape)
if shape_model != shape_checkpoint:
checkpoint_state_dict.pop(k)
else:
checkpoint_state_dict.pop(k)
print(k)
model.load_state_dict(checkpoint_state_dict)
return model, feat_dim
if __name__ == '__main__':
import time
model, feat_dim = build_backbone(pretrained=True)
x = torch.randn(1, 3, 416, 416)
t0 = time.time()
for layer in model.children():
x = layer(x)
print(layer.__class__.__name__, 'output shape:', x.shape)
# y = model(x)
t1 = time.time()
print('Time: ', t1 - t0)
主干网络之后依次接颈部网络、检测头和预测层。先单独看一下预测层的改动:
## 预测层
# 与YOLOv1相比,YOLOv2每个网格会预测5个框(VOC数据集),因此输出通道数需×5(即×num_anchors)
self.obj_pred = nn.Conv2d(head_dim, 1 * self.num_anchors, kernel_size=1)
self.cls_pred = nn.Conv2d(head_dim, num_classes * self.num_anchors, kernel_size=1)
self.reg_pred = nn.Conv2d(head_dim, 4 * self.num_anchors, kernel_size=1)
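上面的预测层接在解耦检测头之后:检测头输出的cls_feat送入cls_pred,reg_feat送入obj_pred和reg_pred。书中build_head的具体实现未在本文列出,下面给出一个最小示意(非书中源码,仅说明分类分支与回归分支各自独立堆叠卷积、互不共享参数的思路):
# 解耦检测头最小示意(非RT-ODLab源码,仅供理解结构)
import torch.nn as nn

class DecoupledHead(nn.Module):
    # 分类分支与回归分支各自堆叠若干3x3卷积,互不共享
    def __init__(self, in_dim, out_dim, num_cls_head=2, num_reg_head=2):
        super().__init__()
        def conv_block(c_in, c_out):
            return nn.Sequential(
                nn.Conv2d(c_in, c_out, 3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.LeakyReLU(0.1, inplace=True))
        self.cls_feats = nn.Sequential(*[conv_block(in_dim if i == 0 else out_dim, out_dim)
                                         for i in range(num_cls_head)])
        self.reg_feats = nn.Sequential(*[conv_block(in_dim if i == 0 else out_dim, out_dim)
                                         for i in range(num_reg_head)])

    def forward(self, x):
        # 分别返回分类特征与回归特征
        return self.cls_feats(x), self.reg_feats(x)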
# RT-ODLab/models/detectors/yolov2/yolov2.py
import torch
import torch.nn as nn
import numpy as np
from utils.misc import multiclass_nms
from .yolov2_backbone import build_backbone
from .yolov2_neck import build_neck
from .yolov2_head import build_head
# YOLOv2
class YOLOv2(nn.Module):
def __init__(self,
cfg,
device,
num_classes=20,
conf_thresh=0.01,
nms_thresh=0.5,
topk=100,
trainable=False,
deploy=False,
nms_class_agnostic=False):
super(YOLOv2, self).__init__()
# ------------------- Basic parameters -------------------
self.cfg = cfg # 模型配置文件
self.device = device # cuda或者是cpu
self.num_classes = num_classes # 类别的数量
self.trainable = trainable # 训练的标记
self.conf_thresh = conf_thresh # 得分阈值
self.nms_thresh = nms_thresh # NMS阈值
self.topk = topk # topk
self.stride = 32 # 网络的最大步长
self.deploy = deploy
self.nms_class_agnostic = nms_class_agnostic
# ------------------- Anchor box -------------------
self.anchor_size = torch.as_tensor(cfg['anchor_size']).float().view(-1, 2) # [A, 2]
self.num_anchors = self.anchor_size.shape[0]
# ------------------- Network Structure -------------------
## 主干网络
self.backbone, feat_dim = build_backbone(
cfg['backbone'], trainable&cfg['pretrained'])
## 颈部网络
self.neck = build_neck(cfg, feat_dim, out_dim=512)
head_dim = self.neck.out_dim
## 检测头
self.head = build_head(cfg, head_dim, head_dim, num_classes)
## 预测层
# 与YoloV1相比,YoloV2每个网格会预测5个框(VOC数据集),因此需×5
self.obj_pred = nn.Conv2d(head_dim, 1 * self.num_anchors, kernel_size=1)
self.cls_pred = nn.Conv2d(head_dim, num_classes * self.num_anchors, kernel_size=1)
self.reg_pred = nn.Conv2d(head_dim, 4 * self.num_anchors, kernel_size=1)
if self.trainable:
self.init_bias()
def init_bias(self):
# init bias
init_prob = 0.01
bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob))
nn.init.constant_(self.obj_pred.bias, bias_value)
nn.init.constant_(self.cls_pred.bias, bias_value)
def generate_anchors(self, fmp_size):
pass
def decode_boxes(self, anchors, reg_pred):
pass
def postprocess(self, obj_pred, cls_pred, reg_pred, anchors):
"""
后处理代码,包括topk操作、阈值筛选和非极大值抑制
"""
pass
@torch.no_grad()
def inference(self, x):
bs = x.shape[0]
# 主干网络
feat = self.backbone(x)
# 颈部网络
feat = self.neck(feat)
# 检测头
cls_feat, reg_feat = self.head(feat)
# 预测层
obj_pred = self.obj_pred(reg_feat)
cls_pred = self.cls_pred(cls_feat)
reg_pred = self.reg_pred(reg_feat)
fmp_size = obj_pred.shape[-2:]
# anchors: [M, 2]
anchors = self.generate_anchors(fmp_size)
# 对 pred 的size做一些view调整,便于后续的处理
# [B, A*C, H, W] -> [B, H, W, A*C] -> [B, H*W*A, C]
obj_pred = obj_pred.permute(0, 2, 3, 1).contiguous().view(bs, -1, 1) # [1, 845=13×13×5, 1]
cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous().view(bs, -1, self.num_classes)
reg_pred = reg_pred.permute(0, 2, 3, 1).contiguous().view(bs, -1, 4)
# 测试时,默认batch是1,
# 因此,我们不需要用batch这个维度,用[0]将其取走。
obj_pred = obj_pred[0] # [H*W*A, 1]
cls_pred = cls_pred[0] # [H*W*A, NC]
reg_pred = reg_pred[0] # [H*W*A, 4]
if self.deploy:
scores = torch.sqrt(obj_pred.sigmoid() * cls_pred.sigmoid())
bboxes = self.decode_boxes(anchors, reg_pred)
# [n_anchors_all, 4 + C]
outputs = torch.cat([bboxes, scores], dim=-1)
return outputs
else:
# post process
bboxes, scores, labels = self.postprocess(
obj_pred, cls_pred, reg_pred, anchors)
return bboxes, scores, labels
def forward(self, x):
if not self.trainable:
return self.inference(x)
else:
bs = x.shape[0]
# 主干网络
feat = self.backbone(x)
# 颈部网络
feat = self.neck(feat)
# 检测头
cls_feat, reg_feat = self.head(feat)
# 预测层
obj_pred = self.obj_pred(reg_feat)
cls_pred = self.cls_pred(cls_feat)
reg_pred = self.reg_pred(reg_feat)
fmp_size = obj_pred.shape[-2:]
# A就是Anchor的数量,VOC数据集上设置为5
# anchors: [M, 2], M = H*W*A
anchors = self.generate_anchors(fmp_size)
# 对 pred 的size做一些view调整,便于后续的处理
# [B, A*C, H, W] -> [B, H, W, A*C] -> [B, H*W*A, C]
obj_pred = obj_pred.permute(0, 2, 3, 1).contiguous().view(bs, -1, 1)
cls_pred = cls_pred.permute(0, 2, 3, 1).contiguous().view(bs, -1, self.num_classes)
reg_pred = reg_pred.permute(0, 2, 3, 1).contiguous().view(bs, -1, 4)
# decode bbox
box_pred = self.decode_boxes(anchors, reg_pred)
# 网络输出
outputs = {"pred_obj": obj_pred, # (Tensor) [B, M, 1]
"pred_cls": cls_pred, # (Tensor) [B, M, C]
"pred_box": box_pred, # (Tensor) [B, M, 4]
"stride": self.stride, # (Int)
"fmp_size": fmp_size # (List) [fmp_h, fmp_w]
}
return outputs
在上面yolov2.py的整体代码中,还遗留几个问题:generate_anchors、decode_boxes和postprocess三个方法尚未实现,下面依次补全。
在此之前,先看一下YOLOv2的网络配置参数,从中能看到anchor_size变量。这些先验框是基于k-means聚类,在COCO数据集上对标注框宽高聚类得到的(聚类流程的示意见配置代码之后);由于COCO数据集更大、图片更加丰富,因此我们将这几个先验框直接用在VOC数据集上。
# RT-ODLab/config/model_config/yolov2_config.py
# YOLOv2 Config
yolov2_cfg = {
# input
'trans_type': 'ssd',
'multi_scale': [0.5, 1.5],
# model
'backbone': 'darknet19',
'pretrained': True,
'stride': 32, # P5
'max_stride': 32,
# neck
'neck': 'sppf',
'expand_ratio': 0.5,
'pooling_size': 5,
'neck_act': 'lrelu',
'neck_norm': 'BN',
'neck_depthwise': False,
# head
'head': 'decoupled_head',
'head_act': 'lrelu',
'head_norm': 'BN',
'num_cls_head': 2,
'num_reg_head': 2,
'head_depthwise': False,
'anchor_size': [[17, 25],
[55, 75],
[92, 206],
[202, 21],
[289, 311]], # 416
# matcher
'iou_thresh': 0.5,
# loss weight
'loss_obj_weight': 1.0,
'loss_cls_weight': 1.0,
'loss_box_weight': 5.0,
# training configuration
'trainer_type': 'yolov8',
}
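配置中的anchor_size就是用k-means(以1-IoU作为距离度量)对标注框宽高聚类得到的宽高对。下面给出聚类流程的最小示意(非书中源码,函数名与细节均为假设;boxes为缩放到统一输入尺寸后的标注框宽高数组):
# k-means聚类先验框宽高的最小示意(非书中源码,仅说明思路)
import numpy as np

def wh_iou(boxes, centroids):
    # 只看宽高、假设左上角对齐时的IoU,返回shape为[N, K]
    inter = np.minimum(boxes[:, None, 0], centroids[None, :, 0]) * \
            np.minimum(boxes[:, None, 1], centroids[None, :, 1])
    union = boxes[:, None, 0] * boxes[:, None, 1] + \
            centroids[None, :, 0] * centroids[None, :, 1] - inter
    return inter / union

def kmeans_anchors(boxes, k=5, iters=100, seed=0):
    rng = np.random.default_rng(seed)
    centroids = boxes[rng.choice(len(boxes), k, replace=False)]
    for _ in range(iters):
        # 距离定义为1-IoU,每个框分配给IoU最大的聚类中心
        assign = wh_iou(boxes, centroids).argmax(axis=1)
        new_centroids = np.array([boxes[assign == i].mean(axis=0) if np.any(assign == i)
                                  else centroids[i] for i in range(k)])
        if np.allclose(new_centroids, centroids):
            break
        centroids = new_centroids
    return centroids  # [k, 2],即配置中anchor_size的来源(按面积排序后填入)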
回想一下,在之前实现的YOLOv1中,我们通过构造矩阵G,得到了每一个网格(grid_x,grid_y)的坐标。
由于我们在YOLOv2中引入了先验框,因此,我们不仅需要每一个网格(grid_x,grid_y)的坐标,还要包含先验框(5个)的尺寸信息。
先验框矩阵生成代码如下
# RT-ODLab/models/detectors/yolov2/yolov2.py
def generate_anchors(self, fmp_size):
"""
fmp_size: (List) [H, W]
默认缩放后的图像为416×416,那么经过32倍下采样后,fmp_size为13×13
"""
        # 1、特征图的高和宽(fmp_size为[H, W])
        fmp_h, fmp_w = fmp_size
# 2、生成网格的x坐标和y坐标
anchor_y, anchor_x = torch.meshgrid([torch.arange(fmp_h), torch.arange(fmp_w)])
# 3、将xy两部分的坐标拼接起来,shape为[H, W, 2]
# 再转换下, shape变为[HW, 2]
anchor_xy = torch.stack([anchor_x, anchor_y], dim=-1).float().view(-1, 2)
# 4、引入了anchor box机制,每个网格包含A个anchor,因此每个(grid_x, grid_y)的坐标需要复制A(Anchor nums)份
# 相当于 每个网格左上角的坐标点复制5份 作为5个不同宽高anchor box的中心点
# [HW, 2] -> [HW, A, 2] -> [M, 2]
anchor_xy = anchor_xy.unsqueeze(1).repeat(1, self.num_anchors, 1)
anchor_xy = anchor_xy.view(-1, 2).to(self.device)
# 5、将kmeans聚类得出的5组anchor box的宽高复制13×13份
# [A, 2] -> [1, A, 2] -> [HW, A, 2] -> [M, 2]
anchor_wh = self.anchor_size.unsqueeze(0).repeat(fmp_h*fmp_w, 1, 1)
anchor_wh = anchor_wh.view(-1, 2).to(self.device)
# 6、将中心点和宽高cat起来,得到的shape为[M, 4]
# 其中M=13×13×5 表示feature map为13×13,每个网格有5组anchor box
# 4代表anchor box的位置(x_center, y_center, w, h)
# 需要注意:
# x_center, y_center是feature map上的坐标位置,需要×stride 才能得到缩放后原始图像上的中心点
# w, h是针对缩放后原始图像
anchors = torch.cat([anchor_xy, anchor_wh], dim=-1)
return anchors
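以默认的416×416输入(13×13特征图)为例,可以验证anchors的形状与排列顺序(示意代码,假设model为已构建好的YOLOv2实例):
# anchors排列为"网格优先、先验框次之":同一网格的5个先验框连续存放,
# 与forward中 [B, H, W, A*C] -> [B, H*W*A, C] 的reshape顺序一致
anchors = model.generate_anchors(fmp_size=[13, 13])
print(anchors.shape)   # torch.Size([845, 4]),845 = 13×13×5
print(anchors[:5])     # 网格(0, 0)处的5个先验框:(grid_x, grid_y, w, h)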
def decode_boxes(self, anchors, reg_pred):
"""
        1、依据预测值reg_pred(t_x, t_y, t_w, t_h)解算出边界框中心点坐标c_x, c_y和宽高b_w, b_h
c_x = ( grid_x + sigmoid(t_x) ) × stride
c_y = ( grid_y + sigmoid(t_y) ) × stride
b_w = p_w × exp(t_w)
b_h = p_h × exp(t_h)
其中 grid_x,grid_y,p_w,p_h为先验框的结果,即anchors结果
2、转换为常用的x1y1x2y2形式。
注意:
预测的宽高不是相对于feature map的,而是相对于resize后图像大小,因此不需要×stride
"""
# 1、计算预测边界框的中心点坐标和宽高
pred_ctr = (anchors[..., :2] + torch.sigmoid(reg_pred[..., :2])) * self.stride
pred_wh = anchors[..., 2:] * torch.exp(reg_pred[..., 2:]) # 不需要×stride
# 2、将所有bbox的中心点坐标和宽高换算成x1y1x2y2形式
pred_x1y1 = pred_ctr - pred_wh * 0.5
pred_x2y2 = pred_ctr + pred_wh * 0.5
pred_box = torch.cat([pred_x1y1, pred_x2y2], dim=-1)
return pred_box
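举一个数值例子:假设stride=32,某先验框位于网格(6, 6)、宽高为配置中的(92, 206),网络预测reg_pred=(0, 0, 0, 0)。由于sigmoid(0)=0.5、exp(0)=1,解码得到中心点c_x = c_y = (6+0.5)×32 = 208,宽高b_w=92、b_h=206,换算成x1y1x2y2即(162, 105, 254, 311)(数值仅为示意,并非书中示例)。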
# RT-ODLab/models/detectors/yolov2/yolov2.py
def postprocess(self, obj_pred, cls_pred, reg_pred, anchors):
"""
后处理代码,包括topk操作、阈值筛选和非极大值抑制
1、topk操作:
        在COCO数据集中,检测对象的数量一般不会超过100,因此先选择得分最高的k个边界框,这里为了取得更高的mAP,取k=1000
在实际的场景中,不需要把k值取这么大
2、滤掉低得分(边界框的score低于给定的阈值)的预测边界框;
3、滤掉那些针对同一目标的冗余检测。
Input:
obj_pred: (Tensor) [H*W*A, 1]
cls_pred: (Tensor) [H*W*A, C]
reg_pred: (Tensor) [H*W*A, 4]
anchors: (Tensor) [H*W*A, 4]
其中,H*W*A = 13×13×5 = 845
"""
# (H x W x A x C,)
# 13×13×5×20 = 16900
scores = torch.sqrt(obj_pred.sigmoid() * cls_pred.sigmoid()).flatten()
# 1、topk操作
# Keep top k top scoring indices only.
num_topk = min(self.topk, reg_pred.size(0))
# torch.sort is actually faster than .topk (at least on GPUs)
predicted_prob, topk_idxs = scores.sort(descending=True)
topk_scores = predicted_prob[:num_topk]
topk_idxs = topk_idxs[:num_topk]
# 2、滤掉低得分(边界框的score低于给定的阈值)的预测边界框
# filter out the proposals with low confidence score
keep_idxs = topk_scores > self.conf_thresh
scores = topk_scores[keep_idxs]
topk_idxs = topk_idxs[keep_idxs]
# 获取flatten之前topk_scores所在的idx以及相应的label
        anchor_idxs = torch.div(topk_idxs, self.num_classes, rounding_mode='floor')  # 获取对应的anchor索引
labels = topk_idxs % self.num_classes
reg_pred = reg_pred[anchor_idxs]
anchors = anchors[anchor_idxs]
# 解算边界框, 并归一化边界框: [H*W*A, 4]
bboxes = self.decode_boxes(anchors, reg_pred)
# to cpu & numpy
scores = scores.cpu().numpy()
labels = labels.cpu().numpy()
bboxes = bboxes.cpu().numpy()
# 3、滤掉那些针对同一目标的冗余检测。
# nms
scores, labels, bboxes = multiclass_nms(
scores, labels, bboxes, self.nms_thresh, self.num_classes, self.nms_class_agnostic)
return bboxes, scores, labels
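multiclass_nms来自utils/misc.py,本文未列出其实现,具体以仓库代码为准。下面给出一个"按类别分组做NMS"的最小示意(非书中源码,函数名与接口均为假设):
# 按类别做NMS的最小示意(非RT-ODLab的multiclass_nms实现,仅说明流程)
import numpy as np

def nms(bboxes, scores, nms_thresh):
    # 单类别NMS:按得分从高到低依次保留,剔除与已保留框IoU过高的框
    x1, y1, x2, y2 = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[1:][iou <= nms_thresh]
    return keep

def multiclass_nms_sketch(scores, labels, bboxes, nms_thresh):
    # 对每个类别单独做NMS,再汇总保留下来的框
    keep = np.zeros(len(bboxes), dtype=bool)
    for c in np.unique(labels):
        inds = np.where(labels == c)[0]
        keep[inds[nms(bboxes[inds], scores[inds], nms_thresh)]] = True
    return scores[keep], labels[keep], bboxes[keep]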
接下来,就到了正样本匹配和损失函数计算了。
计算先验框和目标框的IoU,只有IoU最大的先验框才被标记为正样本,用来计算置信度损失、类别损失以及边界框位置损失;其余先验框均为负样本,仅计算置信度损失。这就是基于先验框的正样本匹配策略。
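这一匹配过程的最小示意如下(非书中源码;书中matcher还会用到配置里的iou_thresh,细节以仓库代码为准):
# 基于先验框的正样本匹配最小示意(非书中源码,仅说明思路)
import numpy as np

def match_anchor(gt_box, anchor_sizes, stride=32):
    # gt_box: resize后图像上的(x1, y1, x2, y2);anchor_sizes: [A, 2]的先验框宽高(numpy数组)
    x1, y1, x2, y2 = gt_box
    # 1、目标中心所在的网格
    cx, cy = (x1 + x2) * 0.5, (y1 + y2) * 0.5
    grid_x, grid_y = int(cx / stride), int(cy / stride)
    # 2、只比较宽高(将目标框与先验框左上角对齐)计算IoU
    gw, gh = x2 - x1, y2 - y1
    inter = np.minimum(gw, anchor_sizes[:, 0]) * np.minimum(gh, anchor_sizes[:, 1])
    union = gw * gh + anchor_sizes[:, 0] * anchor_sizes[:, 1] - inter
    iou = inter / union
    # 3、只有IoU最大的先验框被标记为正样本
    best_anchor = int(iou.argmax())
    return grid_x, grid_y, best_anchor
随后在(grid_y, grid_x, best_anchor)这个位置上填入置信度、类别和边界框标签;其余位置视为负样本,只参与置信度损失的计算。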