　　以下内容仅为个人在学习人工智能过程中所记录的笔记,先将目标识别算法YOLO系列的内容整理出来分享给大家,供大家学习参考。
　　本文仅对YOLOV3代码中关键部分进行了注释,未掌握基础代码的铁汁可以自己百度一下。
　　若文中内容有误,希望大家批评指正。
资料下载
　　YOLOV3论文下载地址:YOLOv3: An Incremental Improvement
回顾
　　YOLO V1:【YOLO系列】YOLO V1论文思想详解
　　YOLO V2:【YOLO系列】YOLO V2论文思想详解
　　YOLO V3:【YOLO系列】YOLOv3论文思想详解
项目地址
　　YOLOV3 Keras版本:下载地址
　　YOLOV3 TensorFlow版本:下载地址
　　YOLOV3 PyTorch版本:下载地址
Gitee仓库
　　YOLOV3 各版本:yolov3各版本
YOLO V3代码详解
　　YOLO V3代码详解(一):【YOLO系列】YOLOv3代码详解(一):主脚本yolo_video.py
　　YOLO V3代码详解(二):【YOLO系列】YOLOv3代码详解(二):检测脚本yolo.py
　　YOLO V3代码详解(三):【YOLO系列】YOLOv3代码详解(三):训练脚本train.py
　　本文主要基于Keras版本进行讲解。
　　话不多说,直接上代码。
@wraps(Conv2D)
def DarknetConv2D(*args, **kwargs):
    """Build a Conv2D layer pre-configured with Darknet defaults.

    Defaults applied before the caller's keyword arguments:
    an L2 kernel regularizer with factor 5e-4, and ``padding='valid'`` when
    ``strides == (2, 2)`` was passed, ``padding='same'`` otherwise.
    Any keyword supplied by the caller overrides these defaults.
    """
    is_stride_two = kwargs.get('strides') == (2, 2)
    conv_kwargs = {
        'kernel_regularizer': l2(5e-4),
        'padding': 'valid' if is_stride_two else 'same',
    }
    # Caller-supplied keywords take precedence over the defaults above.
    conv_kwargs.update(kwargs)
    return Conv2D(*args, **conv_kwargs)
def DarknetConv2D_BN_Leaky(*args, **kwargs):
    """Return DarknetConv2D -> BatchNormalization -> LeakyReLU(alpha=0.1).

    The convolution is created with ``use_bias=False`` unless the caller
    passes ``use_bias`` explicitly. The three layers are chained with
    ``compose`` and returned as a single callable.
    """
    conv_kwargs = {'use_bias': False}
    conv_kwargs.update(kwargs)
    # Layers are instantiated in conv, norm, activation order.
    conv = DarknetConv2D(*args, **conv_kwargs)
    norm = BatchNormalization()
    activation = LeakyReLU(alpha=0.1)
    return compose(conv, norm, activation)
def resblock_body(x, num_filters, num_blocks):
    """Downsample ``x`` with a strided convolution, then apply residual blocks.

    x: input tensor.
    num_filters: number of filters of the downsampling conv and of the 3x3
        conv in every residual block (the 1x1 conv uses half of it).
    num_blocks: number of residual blocks appended after the downsampling.
    Returns the output tensor of the last residual addition.
    """
    # Darknet pads only the top and left side instead of using 'same' mode:
    # ((top, bottom), (left, right)) == ((1, 0), (1, 0)).
    padded = ZeroPadding2D(((1, 0), (1, 0)))(x)
    # strides=(2, 2) makes DarknetConv2D pick 'valid' padding; the strided
    # convolution takes the place of a pooling layer.
    x = DarknetConv2D_BN_Leaky(num_filters, (3, 3), strides=(2, 2))(padded)
    for _ in range(num_blocks):
        # Residual branch: 1x1 bottleneck followed by a 3x3 convolution.
        residual = compose(
            DarknetConv2D_BN_Leaky(num_filters // 2, (1, 1)),
            DarknetConv2D_BN_Leaky(num_filters, (3, 3)))(x)
        x = Add()([x, residual])
    return x
def darknet_body(x):
    """Darknet body having 52 Convolution2D layers.

    One 3x3 stem convolution with 32 filters followed by five
    ``resblock_body`` stages.
    """
    # (num_filters, num_blocks) for each residual stage, in order.
    stages = ((64, 1), (128, 2), (256, 8), (512, 8), (1024, 4))
    x = DarknetConv2D_BN_Leaky(32, (3, 3))(x)
    for num_filters, num_blocks in stages:
        x = resblock_body(x, num_filters, num_blocks)
    return x
def make_last_layers(x, num_filters, out_filters):
    """6 Conv2D_BN_Leaky layers followed by a linear Conv2D layer.

    Returns a tuple ``(x, y)``: ``x`` is the output of the first five
    Conv2D_BN_Leaky layers (alternating 1x1 / 3x3), ``y`` is the prediction
    produced from ``x`` by one more 3x3 Conv2D_BN_Leaky layer and a plain
    1x1 DarknetConv2D with ``out_filters`` filters.
    """
    # Alternate 1x1 (num_filters) and 3x3 (2 * num_filters) convolutions,
    # starting and ending with a 1x1 layer.
    feature_layers = []
    for position in range(5):
        if position % 2 == 0:
            feature_layers.append(DarknetConv2D_BN_Leaky(num_filters, (1, 1)))
        else:
            feature_layers.append(DarknetConv2D_BN_Leaky(num_filters * 2, (3, 3)))
    x = compose(*feature_layers)(x)

    prediction_head = compose(
        DarknetConv2D_BN_Leaky(num_filters * 2, (3, 3)),
        DarknetConv2D(out_filters, (1, 1)))
    y = prediction_head(x)
    return x, y
def yolo_body(inputs, num_anchors, num_classes):
    """Create YOLO_V3 model CNN body in Keras.

    Returns a Model mapping ``inputs`` to three prediction tensors
    ``[y1, y2, y3]``, each with ``num_anchors * (num_classes + 5)`` channels.
    """
    out_filters = num_anchors * (num_classes + 5)
    backbone = Model(inputs, darknet_body(inputs))

    # First head, built directly on the backbone output.
    x, y1 = make_last_layers(backbone.output, 512, out_filters)
    outputs = [y1]

    # Each following head: 1x1 conv + 2x upsampling of the previous head's
    # features, concatenated with the output of a fixed backbone layer
    # (indices 152 and 92 of ``backbone.layers``).
    for num_filters, skip_index in ((256, 152), (128, 92)):
        upsampled = compose(
            DarknetConv2D_BN_Leaky(num_filters, (1, 1)),
            UpSampling2D(2))(x)
        x = Concatenate()([upsampled, backbone.layers[skip_index].output])
        x, y = make_last_layers(x, num_filters, out_filters)
        outputs.append(y)

    return Model(inputs, outputs)
def tiny_yolo_body(inputs, num_anchors, num_classes):
    """Create the tiny YOLO_V3 model CNN body in Keras.

    Returns a Model mapping ``inputs`` to two prediction tensors
    ``[y1, y2]``, each with ``num_anchors * (num_classes + 5)`` channels.
    """
    out_filters = num_anchors * (num_classes + 5)

    def max_pool(stride):
        # 2x2 max pooling with 'same' padding; a stride of 2 halves the
        # spatial size (rounded up), a stride of 1 keeps it.
        return MaxPooling2D(pool_size=(2, 2), strides=(stride, stride), padding='same')

    # Stem: four (3x3 conv, stride-2 pool) pairs, then a 256-filter 3x3 conv.
    stem_layers = []
    for num_filters in (16, 32, 64, 128):
        stem_layers.append(DarknetConv2D_BN_Leaky(num_filters, (3, 3)))
        stem_layers.append(max_pool(2))
    stem_layers.append(DarknetConv2D_BN_Leaky(256, (3, 3)))
    x1 = compose(*stem_layers)(inputs)

    # Deeper branch on top of x1: one more downsampling pool, a stride-1
    # pool, and three convolutions.
    x2 = compose(
        max_pool(2),
        DarknetConv2D_BN_Leaky(512, (3, 3)),
        max_pool(1),
        DarknetConv2D_BN_Leaky(1024, (3, 3)),
        DarknetConv2D_BN_Leaky(256, (1, 1)))(x1)

    # First prediction, made from the deeper branch.
    y1 = compose(
        DarknetConv2D_BN_Leaky(512, (3, 3)),
        DarknetConv2D(out_filters, (1, 1)))(x2)

    # Upsample the deeper branch so it can be concatenated with x1.
    x2 = compose(
        DarknetConv2D_BN_Leaky(128, (1, 1)),
        UpSampling2D(2))(x2)

    # Second prediction, made from the concatenation [x2, x1].
    y2 = compose(
        Concatenate(),
        DarknetConv2D_BN_Leaky(256, (3, 3)),
        DarknetConv2D(out_filters, (1, 1)))([x2, x1])

    return Model(inputs, [y1, y2])
def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False):
    """Convert final layer features to bounding box parameters.

    feats: raw output tensor of one prediction layer; it is reshaped here to
        (-1, grid_h, grid_w, num_anchors, 5 + num_classes).
    anchors: the (w, h) anchors used by this layer; ``len(anchors)`` is the
        number of anchors per grid cell.
    num_classes: number of object classes.
    input_shape: (height, width) used to normalise the box sizes.
    calc_loss: when truthy, return the values needed by the loss instead of
        the decoded predictions.

    Returns ``(grid, feats, box_xy, box_wh)`` when ``calc_loss`` is truthy
    (``feats`` being the reshaped raw features), otherwise
    ``(box_xy, box_wh, box_confidence, box_class_probs)``.
    """
    num_anchors = len(anchors)
    # Reshape to (batch, height, width, num_anchors, box_params) for broadcasting.
    anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2])

    # (height, width) of the feature map.
    grid_shape = K.shape(feats)[1:3]
    # Per-cell (x, y) indices: grid_y varies along the height axis, grid_x
    # along the width axis; both are tiled to (height, width, 1, 1).
    grid_y = K.tile(K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]),
                    [1, grid_shape[1], 1, 1])
    grid_x = K.tile(K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]),
                    [grid_shape[0], 1, 1, 1])
    grid = K.concatenate([grid_x, grid_y])
    grid = K.cast(grid, K.dtype(feats))

    feats = K.reshape(
        feats, [-1, grid_shape[0], grid_shape[1], num_anchors, 5 + num_classes])

    # Bounding-box decoding: the centre is sigmoid(offset) + cell index,
    # divided by the grid size (w, h); the size is exp(raw) * anchor, divided
    # by the input size (w, h).
    box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(grid_shape[::-1], K.dtype(feats))
    box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(input_shape[::-1], K.dtype(feats))

    if calc_loss:
        # The loss works on the raw features, so no confidence/class
        # activations are needed on this path.
        return grid, feats, box_xy, box_wh

    box_confidence = K.sigmoid(feats[..., 4:5])
    box_class_probs = K.sigmoid(feats[..., 5:])
    return box_xy, box_wh, box_confidence, box_class_probs
def yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape):
    """Get corrected boxes.

    Maps normalised (x, y) / (w, h) boxes expressed relative to the network
    input onto the original image, undoing the aspect-preserving resize and
    centring offset. Returns boxes as (y_min, x_min, y_max, x_max) scaled by
    ``image_shape``.
    """
    # Switch to (y, x) / (h, w) order to match the (height, width) shapes.
    centers_yx = box_xy[..., ::-1]
    sizes_hw = box_wh[..., ::-1]
    dtype = K.dtype(centers_yx)
    input_shape = K.cast(input_shape, dtype)
    image_shape = K.cast(image_shape, dtype)

    # Size of the image after scaling by the smaller of the two
    # input/image ratios.
    resize_ratio = K.min(input_shape / image_shape)
    new_shape = K.round(image_shape * resize_ratio)
    # Relative offset of the resized image inside the network input, and the
    # factor that stretches the resized area back to the full range.
    offset = (input_shape - new_shape) / 2. / input_shape
    scale = input_shape / new_shape

    centers_yx = (centers_yx - offset) * scale
    sizes_hw = sizes_hw * scale

    half_hw = sizes_hw / 2.
    box_mins = centers_yx - half_hw
    box_maxes = centers_yx + half_hw
    boxes = K.concatenate([
        box_mins[..., 0:1],   # y_min
        box_mins[..., 1:2],   # x_min
        box_maxes[..., 0:1],  # y_max
        box_maxes[..., 1:2],  # x_max
    ])

    # Scale boxes back to original image shape.
    return boxes * K.concatenate([image_shape, image_shape])
def yolo_boxes_and_scores(feats, anchors, num_classes, input_shape, image_shape):
    """Process Conv layer output.

    feats: raw output of one prediction layer.
    anchors: the anchors belonging to that layer.
    num_classes: number of classes.
    input_shape: network input size (the caller passes 32x the size of the
        first output's feature map).
    image_shape: size of the original image.

    Returns ``(boxes, box_scores)`` flattened to shapes (-1, 4) and
    (-1, num_classes).
    """
    # Decode centre, size, objectness and class probabilities.
    xy, wh, confidence, class_probs = yolo_head(
        feats, anchors, num_classes, input_shape)
    # Map the boxes onto the original image and flatten them.
    corrected = yolo_correct_boxes(xy, wh, input_shape, image_shape)
    flat_boxes = K.reshape(corrected, [-1, 4])
    # Per-class score = objectness * class probability.
    flat_scores = K.reshape(confidence * class_probs, [-1, num_classes])
    return flat_boxes, flat_scores
def yolo_eval(yolo_outputs,
              anchors,
              num_classes,
              image_shape,
              max_boxes=20,
              score_threshold=.6,
              iou_threshold=.5):
    """Evaluate YOLO model on given input and return filtered boxes.

    yolo_outputs: list of prediction tensors, one per output layer.
    anchors: anchor boxes; indexed with a list of indices, so an array
        supporting that kind of indexing is expected.
    num_classes: number of classes.
    image_shape: size of the original image.
    max_boxes: maximum number of boxes kept by NMS for each class.
    score_threshold: minimum class score for a box to be considered.
    iou_threshold: IoU threshold used by non-max suppression.

    Returns ``(boxes_, scores_, classes_)`` concatenated over all classes.
    """
    # Which anchors belong to which output layer.
    num_layers = len(yolo_outputs)
    if num_layers == 3:
        anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    else:
        anchor_mask = [[3, 4, 5], [1, 2, 3]]  # default setting
    # Network input size: 32x the size of the first feature map.
    input_shape = K.shape(yolo_outputs[0])[1:3] * 32

    # Decode boxes and scores of every output layer, then stack them.
    per_layer = [
        yolo_boxes_and_scores(yolo_outputs[idx], anchors[anchor_mask[idx]],
                              num_classes, input_shape, image_shape)
        for idx in range(num_layers)
    ]
    boxes = K.concatenate([layer_boxes for layer_boxes, _ in per_layer], axis=0)
    box_scores = K.concatenate([layer_scores for _, layer_scores in per_layer], axis=0)

    # Keep only scores that reach the threshold.
    mask = box_scores >= score_threshold
    max_boxes_tensor = K.constant(max_boxes, dtype='int32')

    boxes_ = []
    scores_ = []
    classes_ = []
    for class_id in range(num_classes):
        # TODO: use keras backend instead of tf.
        keep = mask[:, class_id]
        candidates = tf.boolean_mask(boxes, keep)
        candidate_scores = tf.boolean_mask(box_scores[:, class_id], keep)
        # Non-max suppression among the candidates of this class.
        selected = tf.image.non_max_suppression(
            candidates, candidate_scores, max_boxes_tensor, iou_threshold=iou_threshold)
        kept_boxes = K.gather(candidates, selected)
        kept_scores = K.gather(candidate_scores, selected)
        boxes_.append(kept_boxes)
        scores_.append(kept_scores)
        classes_.append(K.ones_like(kept_scores, 'int32') * class_id)

    boxes_ = K.concatenate(boxes_, axis=0)
    scores_ = K.concatenate(scores_, axis=0)
    classes_ = K.concatenate(classes_, axis=0)
    return boxes_, scores_, classes_
def preprocess_true_boxes(true_boxes, input_shape, anchors, num_classes):
    """Preprocess true boxes to training input format.

    Parameters
    ----------
    true_boxes: array, shape=(m, T, 5)
        Absolute x_min, y_min, x_max, y_max, class_id relative to input_shape.
        Rows whose width is not positive are treated as padding and skipped,
        wherever they appear among the T rows.
    input_shape: array-like, hw, multiples of 32
    anchors: array, shape=(N, 2), wh
    num_classes: integer

    Returns
    -------
    y_true: list of array, shape like yolo_outputs, xywh are relative values

    Raises
    ------
    AssertionError
        If any class id is not smaller than ``num_classes``.
    """
    # Work on a float32 copy so the caller's data is never modified.
    true_boxes = np.array(true_boxes, dtype='float32')
    assert (true_boxes[..., 4] < num_classes).all(), 'class id must be less than num_classes'

    # Anchors are split into groups of three, one group per output layer.
    # The first (coarsest) layer uses the anchors with the highest indices.
    num_layers = len(anchors) // 3  # default setting
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]

    input_shape = np.array(input_shape, dtype='int32')
    input_wh = input_shape[::-1]

    # Corner format -> centre / size, then normalise by the input (w, h).
    # NOTE: the centre is floor-divided, i.e. snapped to whole pixels.
    boxes_xy = (true_boxes[..., 0:2] + true_boxes[..., 2:4]) // 2
    boxes_wh = true_boxes[..., 2:4] - true_boxes[..., 0:2]
    true_boxes[..., 0:2] = boxes_xy / input_wh
    true_boxes[..., 2:4] = boxes_wh / input_wh

    m = true_boxes.shape[0]
    # Feature map size of each output layer (strides 32, 16, 8).
    strides = (32, 16, 8)
    grid_shapes = [input_shape // strides[l] for l in range(num_layers)]
    y_true = [np.zeros((m, grid_shapes[l][0], grid_shapes[l][1], len(anchor_mask[l]), 5 + num_classes),
                       dtype='float32') for l in range(num_layers)]

    # Expand dim to apply broadcasting: anchors become (1, N, 2).
    anchors = np.expand_dims(anchors, 0)
    anchor_maxes = anchors / 2.
    anchor_mins = -anchor_maxes
    anchor_area = anchors[..., 0] * anchors[..., 1]
    # A box is valid when its width is positive.
    valid_mask = boxes_wh[..., 0] > 0

    for b in range(m):
        # Keep the ORIGINAL row indices of the valid boxes: the rows are
        # looked up in true_boxes again below, so positions in the filtered
        # array must not be used as row indices.
        valid_idx = np.flatnonzero(valid_mask[b])
        if valid_idx.size == 0:
            continue
        # Expand dim to apply broadcasting: wh becomes (T_valid, 1, 2).
        wh = np.expand_dims(boxes_wh[b, valid_idx], -2)
        box_maxes = wh / 2.
        box_mins = -box_maxes

        # IoU between every valid box and every anchor, both centred at 0.
        intersect_mins = np.maximum(box_mins, anchor_mins)
        intersect_maxes = np.minimum(box_maxes, anchor_maxes)
        intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.)
        intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
        box_area = wh[..., 0] * wh[..., 1]
        iou = intersect_area / (box_area + anchor_area - intersect_area)

        # Find best anchor for each true box.
        best_anchor = np.argmax(iou, axis=-1)

        for t, n in zip(valid_idx, best_anchor):
            for l in range(num_layers):
                if n in anchor_mask[l]:
                    # Grid cell containing the box centre (i: column, j: row).
                    i = np.floor(true_boxes[b, t, 0] * grid_shapes[l][1]).astype('int32')
                    j = np.floor(true_boxes[b, t, 1] * grid_shapes[l][0]).astype('int32')
                    k = anchor_mask[l].index(n)
                    c = true_boxes[b, t, 4].astype('int32')
                    y_true[l][b, j, i, k, 0:4] = true_boxes[b, t, 0:4]
                    y_true[l][b, j, i, k, 4] = 1
                    y_true[l][b, j, i, k, 5 + c] = 1

    return y_true
def box_iou(b1, b2):
    """Return iou tensor.

    Parameters
    ----------
    b1: tensor, shape=(i1,...,iN, 4), xywh
    b2: tensor, shape=(j, 4), xywh

    Returns
    -------
    iou: tensor, shape=(i1,...,iN, j)
    """
    def corners(boxes):
        # Split xywh boxes into (mins, maxes, wh).
        xy = boxes[..., :2]
        wh = boxes[..., 2:4]
        half_wh = wh / 2.
        return xy - half_wh, xy + half_wh, wh

    # Expand dims so that every b1 box broadcasts against every b2 box.
    b1 = K.expand_dims(b1, -2)
    b2 = K.expand_dims(b2, 0)
    b1_mins, b1_maxes, b1_wh = corners(b1)
    b2_mins, b2_maxes, b2_wh = corners(b2)

    # Overlap rectangle; negative extents are clamped to 0 (no overlap).
    overlap_wh = K.maximum(
        K.minimum(b1_maxes, b2_maxes) - K.maximum(b1_mins, b2_mins), 0.)
    overlap_area = overlap_wh[..., 0] * overlap_wh[..., 1]

    b1_area = b1_wh[..., 0] * b1_wh[..., 1]
    b2_area = b2_wh[..., 0] * b2_wh[..., 1]
    return overlap_area / (b1_area + b2_area - overlap_area)
def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, print_loss=False):
    """Return yolo_loss tensor.

    Parameters
    ----------
    args: list of tensor
        The first ``num_layers`` entries are the model outputs (output of
        yolo_body or tiny_yolo_body), the remaining entries are the matching
        y_true tensors (output of preprocess_true_boxes), where
        ``num_layers = len(anchors) // 3``.
    anchors: array, shape=(N, 2), wh
    num_classes: integer
    ignore_thresh: float, the iou threshold whether to ignore object confidence loss
    print_loss: bool, when true the loss terms are printed through tf.Print
        (NOTE(review): tf.Print is a TensorFlow 1.x API)

    Returns
    -------
    loss: scalar tensor, the sum over all layers of the xy, wh, confidence
        and class losses, each summed over all elements and divided by the
        batch size
    """
    num_layers = len(anchors) // 3  # default setting
    yolo_outputs = args[:num_layers]
    y_true = args[num_layers:]
    # Which anchors belong to which output layer.
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]
    # Network input size: 32x the (height, width) of the first output.
    input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0]))
    grid_shapes = [K.cast(K.shape(yolo_outputs[l])[1:3], K.dtype(y_true[0])) for l in range(num_layers)]
    loss = 0
    # Batch size, as an integer tensor and as a float for the averaging.
    m = K.shape(yolo_outputs[0])[0]
    mf = K.cast(m, K.dtype(yolo_outputs[0]))

    for l in range(num_layers):
        # Channel 4 is the objectness flag, channels 5.. the class targets.
        object_mask = y_true[l][..., 4:5]
        true_class_probs = y_true[l][..., 5:]

        grid, raw_pred, pred_xy, pred_wh = yolo_head(yolo_outputs[l],
                                                     anchors[anchor_mask[l]], num_classes, input_shape, calc_loss=True)
        pred_box = K.concatenate([pred_xy, pred_wh])

        # Darknet raw box to calculate loss: invert the decoding done in
        # yolo_head to get the raw targets for xy and wh.
        raw_true_xy = y_true[l][..., :2] * grid_shapes[l][::-1] - grid
        raw_true_wh = K.log(y_true[l][..., 2:4] / anchors[anchor_mask[l]] * input_shape[::-1])
        raw_true_wh = K.switch(object_mask, raw_true_wh, K.zeros_like(raw_true_wh))  # avoid log(0)=-inf
        # Scale in (1, 2]: the smaller the true box, the larger the weight.
        box_loss_scale = 2 - y_true[l][..., 2:3] * y_true[l][..., 3:4]

        # Find ignore mask, iterate over each of batch.
        ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1, dynamic_size=True)
        object_mask_bool = K.cast(object_mask, 'bool')

        def loop_body(b, ignore_mask):
            # True boxes of sample b, i.e. the cells flagged as objects.
            true_box = tf.boolean_mask(y_true[l][b, ..., 0:4], object_mask_bool[b, ..., 0])
            iou = box_iou(pred_box[b], true_box)
            best_iou = K.max(iou, axis=-1)
            # 1 where the prediction overlaps no true box by ignore_thresh
            # or more, 0 otherwise.
            ignore_mask = ignore_mask.write(b, K.cast(best_iou < ignore_thresh, K.dtype(true_box)))
            return b + 1, ignore_mask

        # tf.while_loop is the public API for the loop; the extra loop
        # variables are accepted and ignored by the condition.
        _, ignore_mask = tf.while_loop(lambda b, *_: b < m, loop_body, [0, ignore_mask])
        ignore_mask = ignore_mask.stack()
        ignore_mask = K.expand_dims(ignore_mask, -1)

        # K.binary_crossentropy is helpful to avoid exp overflow.
        xy_loss = object_mask * box_loss_scale * K.binary_crossentropy(
            raw_true_xy, raw_pred[..., 0:2], from_logits=True)
        wh_loss = object_mask * box_loss_scale * 0.5 * K.square(raw_true_wh - raw_pred[..., 2:4])
        # Object cells always contribute; no-object cells contribute only
        # where ignore_mask is 1.
        confidence_loss = (
            object_mask * K.binary_crossentropy(object_mask, raw_pred[..., 4:5], from_logits=True)
            + (1 - object_mask) * K.binary_crossentropy(object_mask, raw_pred[..., 4:5], from_logits=True) * ignore_mask
        )
        class_loss = object_mask * K.binary_crossentropy(true_class_probs, raw_pred[..., 5:], from_logits=True)

        xy_loss = K.sum(xy_loss) / mf
        wh_loss = K.sum(wh_loss) / mf
        confidence_loss = K.sum(confidence_loss) / mf
        class_loss = K.sum(class_loss) / mf
        loss += xy_loss + wh_loss + confidence_loss + class_loss
        if print_loss:
            loss = tf.Print(loss, [loss, xy_loss, wh_loss, confidence_loss, class_loss, K.sum(ignore_mask)], message='loss: ')
    return loss