代码路径:mmdet/datasets/custom.py
mmdet中的CustomDataset继承自torch的Dataset,因此,对应的需要实现3个虚函数,接下来我们首先看看这3个重要函数的实现。
CLASSES = None  # class-level default; __init__ replaces it per instance


def __init__(self,
             ann_file,
             pipeline,
             classes=None,
             data_root=None,
             img_prefix='',
             seg_prefix=None,
             proposal_file=None,
             test_mode=False,
             filter_empty_gt=True):
    """Set up paths, load annotations, filter images and build the pipeline.

    Args:
        ann_file: Path of the annotation file.
        pipeline: Sequence of transform configs handed to ``Compose``.
        classes: Dataset classes. Passed to ``self.get_classes``; according
            to the original notes it may be ``None`` (fall back to
            ``CLASSES``), a text file, or a list.
        data_root: Optional root directory. When given, every relative path
            below is joined onto it.
        img_prefix: Image directory prefix.
        seg_prefix: Segmentation-map directory prefix.
        proposal_file: Optional proposal file.
        test_mode: When true, the image filtering step is skipped.
            Annotations are still loaded through ``load_annotations``.
        filter_empty_gt: Stored on the instance; ``_filter_imgs`` uses it to
            decide whether images without ground truth are dropped.
    """
    self.ann_file = ann_file
    self.data_root = data_root
    self.img_prefix = img_prefix
    self.seg_prefix = seg_prefix
    self.proposal_file = proposal_file
    self.test_mode = test_mode
    self.filter_empty_gt = filter_empty_gt
    self.CLASSES = self.get_classes(classes)

    # Join paths onto data_root, leaving absolute paths and None untouched.
    if self.data_root is not None:
        if not osp.isabs(self.ann_file):
            self.ann_file = osp.join(self.data_root, self.ann_file)
        if not (self.img_prefix is None or osp.isabs(self.img_prefix)):
            self.img_prefix = osp.join(self.data_root, self.img_prefix)
        if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)):
            self.seg_prefix = osp.join(self.data_root, self.seg_prefix)
        if not (self.proposal_file is None
                or osp.isabs(self.proposal_file)):
            self.proposal_file = osp.join(self.data_root,
                                          self.proposal_file)

    # Load annotations (and proposals, when a proposal file is configured).
    self.data_infos = self.load_annotations(self.ann_file)
    if self.proposal_file is not None:
        self.proposals = self.load_proposals(self.proposal_file)
    else:
        self.proposals = None

    # Training only: keep the entries selected by _filter_imgs, and keep the
    # proposals aligned with the surviving images.
    if not test_mode:
        valid_inds = self._filter_imgs()
        self.data_infos = [self.data_infos[i] for i in valid_inds]
        if self.proposals is not None:
            self.proposals = [self.proposals[i] for i in valid_inds]
        # Group flag for the sampler. Per the article, images are grouped by
        # aspect ratio (< 1 -> group 0, > 1 -> group 1).
        # NOTE(review): the source lost its indentation; this call is placed
        # inside the training branch as in upstream mmdet - confirm.
        self._set_group_flag()

    # Build the processing pipeline.
    self.pipeline = Compose(pipeline)
def __getitem__(self, idx):
    """Return the processed sample for ``idx``.

    In test mode the result of ``prepare_test_img`` is returned directly.
    Otherwise ``prepare_train_img`` is retried, with replacement indices
    from ``_rand_another``, until it returns something other than ``None``.
    """
    if self.test_mode:
        return self.prepare_test_img(idx)
    while True:
        data = self.prepare_train_img(idx)
        if data is None:
            # Empty sample: draw another index and try again.
            idx = self._rand_another(idx)
            continue
        return data
返回数据集的大小
def __len__(self):
    """Total number of samples of data."""
    return len(self.data_infos)
CocoDataset类继承自CustomDataset,分别实现了如下的方法:
def load_annotations(self, ann_file):
    """Load image records from a COCO-style annotation file.

    Side effects: sets ``self.coco``, ``self.cat_ids``, ``self.cat2label``
    and ``self.img_ids``.

    Args:
        ann_file: Path handed to ``COCO``.

    Returns:
        list[dict]: One image-info dict per image id, each with an extra
        ``'filename'`` key copied from ``'file_name'``.
    """
    self.coco = COCO(ann_file)
    # Category ids for the configured class names.
    self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES)
    # Map each category id to a contiguous label index.
    self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
    self.img_ids = self.coco.get_img_ids()
    data_infos = []
    for i in self.img_ids:
        info = self.coco.load_imgs([i])[0]
        info['filename'] = info['file_name']
        data_infos.append(info)
    return data_infos
def get_ann_info(self, idx):
    """Get COCO annotation by index.

    Args:
        idx (int): Index of data.

    Returns:
        dict: Annotation info of specified index, as produced by
        ``_parse_ann_info``.
    """
    # Dataset index -> image id -> annotation ids -> raw annotation dicts.
    img_id = self.data_infos[idx]['id']
    ann_ids = self.coco.get_ann_ids(img_ids=[img_id])
    ann_info = self.coco.load_anns(ann_ids)
    return self._parse_ann_info(self.data_infos[idx], ann_info)
def _filter_imgs(self, min_size=32):
    """Filter images too small or without ground truths.

    Side effect: ``self.img_ids`` is replaced by the ids of the images that
    survive, so it stays aligned with the returned indices.

    Args:
        min_size (int): Minimum allowed length of the shorter image side.

    Returns:
        list[int]: Indices into ``self.data_infos`` of the images to keep.
    """
    # Images that have at least one annotation.
    ids_with_ann = {ann['image_id'] for ann in self.coco.anns.values()}
    # Images that contain annotations of the required categories.
    ids_in_cat = set()
    for class_id in self.cat_ids:
        ids_in_cat |= set(self.coco.cat_img_map[class_id])
    # Both conditions must hold; only applied when filter_empty_gt is set.
    ids_in_cat &= ids_with_ann

    valid_inds = []
    valid_img_ids = []
    for i, img_info in enumerate(self.data_infos):
        img_id = self.img_ids[i]
        # Drop background images (no ground truth of interest).
        if self.filter_empty_gt and img_id not in ids_in_cat:
            continue
        # Drop images whose shorter side is below min_size.
        if min(img_info['width'], img_info['height']) >= min_size:
            valid_inds.append(i)
            valid_img_ids.append(img_id)
    self.img_ids = valid_img_ids
    return valid_inds
def _parse_ann_info(self, img_info, ann_info):
    """Parse bbox and mask annotation.

    Args:
        img_info (dict): Image info; ``'width'``, ``'height'`` and
            ``'filename'`` are read.
        ann_info (list[dict]): Annotation info of an image.

    Returns:
        dict: A dict containing the following keys: bboxes, bboxes_ignore,
        labels, masks, seg_map. "masks" are raw annotations and not
        decoded into binary masks.
    """
    gt_bboxes = []
    gt_labels = []
    gt_bboxes_ignore = []
    gt_masks_ann = []
    for ann in ann_info:
        if ann.get('ignore', False):
            continue
        x1, y1, w, h = ann['bbox']
        # Size of the part of the box that lies inside the image.
        inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
        inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
        # Skip boxes that do not overlap the image at all.
        if inter_w * inter_h == 0:
            continue
        # Skip degenerate boxes (non-positive area or sub-pixel sides).
        if ann['area'] <= 0 or w < 1 or h < 1:
            continue
        # Skip categories this dataset was not configured for.
        if ann['category_id'] not in self.cat_ids:
            continue
        # Convert (x, y, w, h) to (x1, y1, x2, y2).
        bbox = [x1, y1, x1 + w, y1 + h]
        if ann.get('iscrowd', False):
            gt_bboxes_ignore.append(bbox)
        else:
            gt_bboxes.append(bbox)
            # Contiguous label index for this category id.
            gt_labels.append(self.cat2label[ann['category_id']])
            gt_masks_ann.append(ann.get('segmentation', None))

    if gt_bboxes:
        gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
        gt_labels = np.array(gt_labels, dtype=np.int64)
    else:
        gt_bboxes = np.zeros((0, 4), dtype=np.float32)
        gt_labels = np.array([], dtype=np.int64)

    if gt_bboxes_ignore:
        gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
    else:
        gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

    # Swap only the trailing "jpg" extension for "png". A plain
    # str.replace would also rewrite "jpg" inside directory names.
    filename = img_info['filename']
    if filename.endswith('jpg'):
        seg_map = filename[:-len('jpg')] + 'png'
    else:
        seg_map = filename

    ann = dict(
        bboxes=gt_bboxes,
        labels=gt_labels,
        bboxes_ignore=gt_bboxes_ignore,
        masks=gt_masks_ann,
        seg_map=seg_map)
    return ann
首先,我们看一下coco数据集对应的config文件内容,具体如下:
# Dataset settings for COCO.
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    to_rgb=True,
)

# Transforms applied, in order, to every training sample.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]

# Transforms applied at test time; the per-image steps are nested inside
# MultiScaleFlipAug.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1333, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]

# Dataloader settings plus one dataset config per split.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    # Training dataset config.
    train=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_train2017.json',
        img_prefix=data_root + 'train2017/',
        pipeline=train_pipeline,
    ),
    # Validation dataset config.
    val=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_val2017.json',
        img_prefix=data_root + 'val2017/',
        pipeline=test_pipeline,
    ),
    # Test dataset config.
    test=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/instances_val2017.json',
        img_prefix=data_root + 'val2017/',
        pipeline=test_pipeline,
    ),
)
evaluation = dict(interval=1, metric='bbox')
根据上述config文件进行数据处理的过程如下:
# Build the training dataset(s) from the config, then one dataloader per
# dataset.
datasets = [build_dataset(cfg.data.train)]
data_loaders = [
    build_dataloader(
        ds,
        cfg.data.samples_per_gpu,
        cfg.data.workers_per_gpu,
        # cfg.gpus will be ignored if distributed
        len(cfg.gpu_ids),
        dist=distributed,
        seed=cfg.seed) for ds in datasets  # was `dataset`, which is undefined
]
而build_dataloader定义在mmdet/datasets/builder.py中。
from torch.utils.data import DataLoader
def build_dataloader(dataset, samples_per_gpu, workers_per_gpu,
                     num_gpus=1, shuffle=True, seed=None, **kwargs):
    """Build a PyTorch ``DataLoader`` for ``dataset``.

    Args:
        dataset: Dataset to load from.
        samples_per_gpu: Number of samples on each GPU.
        workers_per_gpu: Number of data-loading workers per GPU.
        num_gpus: Number of GPUs; scales both batch size and worker count.
        shuffle: When true, a ``GroupSampler`` is used; otherwise no sampler
            is passed.
        seed: When not ``None``, workers are seeded via ``worker_init_fn``.
        **kwargs: Forwarded to ``DataLoader``.

    Returns:
        DataLoader: The configured loader.
    """
    # GroupSampler groups images by aspect ratio (per the article, ratio > 1
    # is group 1 and ratio < 1 is group 0) so that batches need less padding.
    sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None
    # One batch holds the samples of all GPUs.
    batch_size = num_gpus * samples_per_gpu
    # Total number of loading worker processes.
    num_workers = num_gpus * workers_per_gpu
    # With a seed, every worker gets a fixed seed; otherwise none is set.
    init_fn = partial(
        worker_init_fn, num_workers=num_workers, rank=0,
        seed=seed) if seed is not None else None

    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        # The sampler implements __len__ and __iter__ and yields one integer
        # index at a time.
        sampler=sampler,
        num_workers=num_workers,
        # collate is the mmcv function shown elsewhere in this article.
        collate_fn=partial(collate, samples_per_gpu=samples_per_gpu),
        pin_memory=False,
        worker_init_fn=init_fn,
        **kwargs)
    return data_loader
def worker_init_fn(worker_id, num_workers, rank, seed):
    """Seed NumPy and ``random`` for one dataloader worker.

    Intended to be passed (via ``functools.partial``) as the DataLoader's
    ``worker_init_fn``, which receives the worker id as its first argument.

    Args:
        worker_id: Id of the worker.
        num_workers: Total number of workers.
        rank: Rank of the current process.
        seed: Base seed.
    """
    # Distinct, reproducible seed for each (rank, worker) pair.
    worker_seed = num_workers * rank + worker_id + seed
    np.random.seed(worker_seed)
    random.seed(worker_seed)
mmdet中的数据增强是在datasets中完成的,具体的代码如下:
self.pipeline = Compose(pipeline)
# NOTE: __getitem__ calls prepare_train_img, which runs this pipeline.
def pre_pipeline(self, results):
    """Prepare results dict for pipeline.

    Adds the dataset's prefixes and proposal file to ``results`` in place
    and initialises the ``*_fields`` lists to empty.
    """
    results['img_prefix'] = self.img_prefix
    results['seg_prefix'] = self.seg_prefix
    results['proposal_file'] = self.proposal_file
    results['bbox_fields'] = []
    results['mask_fields'] = []
    results['seg_fields'] = []
def prepare_train_img(self, idx):
    """Get training data and annotations after pipeline.

    Args:
        idx (int): Index of data.

    Returns:
        dict: Training data and annotation after pipeline with new keys
        introduced by pipeline.
    """
    img_info = self.data_infos[idx]
    ann_info = self.get_ann_info(idx)
    results = dict(img_info=img_info, ann_info=ann_info)
    if self.proposals is not None:
        results['proposals'] = self.proposals[idx]
    self.pre_pipeline(results)
    # The data augmentation happens inside the pipeline.
    return self.pipeline(results)
接下来,让我们看看传入到数据增强pipeline中的results字典最开始包含哪些内容。
img_prefix
img_info
● height
● width
● id
● filename(file_name)
ann_info
● bboxes
● labels
● bboxes_ignore
● masks
● seg_map
seg_prefix
proposal_file
bbox_fields
mask_fields
seg_fields
之后进行各种数据处理,results字典内容的变换如下
我自己简单调试画出来如下的字典内容变化图像。
从文件中加载图像数据。
保持宽高比缩放图像
dict(type='Resize', img_scale=(1300, 1300), keep_ratio=True),
if ratio_range is not None:
    # Mode 1: a single fixed scale plus a range of image ratios
    # (the scale stays the same, the ratio varies).
    assert len(self.img_scale) == 1
else:
    # Mode 2: multiple scales or a range of scales (the scale varies, the
    # ratio stays the same), e.g.
    # dict(img_scale=[(1333, 800), (512, 512)], multiscale_mode='value')
    #   'range': randomly sample a scale from the given scale interval
    #   'value': randomly pick one of the given scales
    assert multiscale_mode in ['value', 'range']
进行水平、竖直和沿对角线翻转,对图像和bbox进行翻转。有三种方式:
flip_ratio为float,direction为string,例如flip_ratio=0.5,direction='horizontal',则表示图像将以0.5的概率进行水平翻转;
flip_ratio为float,direction为string列表,例如flip_ratio=0.5,direction=['horizontal', 'vertical'],则表示图像将以flip_ratio/len(direction)(此例中为0.25)的概率分别进行水平和竖直翻转;
flip_ratio为float列表,direction为string列表,例如flip_ratio=[0.3, 0.5],direction=['horizontal', 'vertical'],则图像将以0.3的概率进行水平翻转,0.5的概率进行竖直翻转。
进行翻转的方向direction有3种选择:'horizontal'、'vertical'、'diagonal',默认为'horizontal'。
dict(type='RandomFlip', flip_ratio=0.75, direction=['horizontal', 'vertical', 'diagonal']), # do all flip.
对图像数据进行归一化
dict(type='Normalize', **img_norm_cfg),
两种pad模式:
dict(type='Pad', size_divisor=32),
def _pad_img(self, results):
    """Pad images according to ``self.size`` or ``self.size_divisor``.

    Every key listed in ``results['img_fields']`` (default ``['img']``) is
    padded in place. ``pad_shape``, ``pad_fixed_size`` and
    ``pad_size_divisor`` are then recorded in ``results``.

    Raises:
        ValueError: If neither ``self.size`` nor ``self.size_divisor`` is
            set (previously this surfaced as an unbound-variable error).
    """
    if self.size is None and self.size_divisor is None:
        raise ValueError('either size or size_divisor must be set')

    for key in results.get('img_fields', ['img']):
        # Per the original author's note, padding is added on the right and
        # bottom only, so the image origin does not move.
        if self.size is not None:
            # Pad to a fixed size.
            padded_img = mmcv.impad(
                results[key], shape=self.size, pad_val=self.pad_val)
        elif self.size_divisor is not None:
            # Pad up to the next multiple of the divisor.
            padded_img = mmcv.impad_to_multiple(
                results[key], self.size_divisor, pad_val=self.pad_val)
        results[key] = padded_img

    # NOTE(review): the source lost its indentation; the depth-image branch
    # is placed after the loop here - confirm against the real file.
    if self.need_depth_img:
        assert self.size_divisor is not None
        depth_padded_img = mmcv.impad_to_multiple(
            results['depth_img'], self.size_divisor, pad_val=self.pad_val)
        results['depth_img'] = depth_padded_img

    results['pad_shape'] = padded_img.shape
    results['pad_fixed_size'] = self.size
    results['pad_size_divisor'] = self.size_divisor
将img、bboxes、labels转换为tensor,再转换为DataContainer。
dict(type='DefaultFormatBundle'),
将一些标注信息插入到results['img_metas']中
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
在测试集上测试时,**必须使用MultiScaleFlipAug**!即使不采用多尺度测试,也要使用MultiScaleFlipAug,因为BaseDetector.forward_test只接收list。
若采用了多尺度测试,即num_augs大于1,那么batch必须为1;若num_augs等于1,batch可以大于1。
# Test-time pipeline. The per-image transforms are nested inside
# MultiScaleFlipAug even though only a single scale is used.
test_pipeline = [
    # Default form, for comparison: dict(type='LoadImageFromFile')
    dict(type='LoadImageFromFile', to_float32=True),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1300, 1300),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ],
    ),
]
数据容器DataContainer定义在mmcv中,主要是为了解决不同size的图像要stack到同一个batch中的问题。它主要维护data、stack、padding_value、cpu_only和pad_dims这几个属性。
import functools
import torch
class DataContainer:
    """Wrapper that tags a piece of data with hints for batching.

    Typical settings according to the original notes:

    - img_meta:   ``cpu_only=True``
    - img tensor: ``cpu_only=False, stack=True``
    - gt_bboxes:  ``cpu_only=False, stack=False``

    Args:
        data: The wrapped data; any type is accepted.
        stack: Whether the data should be stacked into a batch (true for
            images).
        padding_value: Fill value used when padding. Defaults to 0.
        cpu_only: Whether the data stays on the CPU, e.g. meta information
            such as scale or flip flags.
        pad_dims: Number of trailing dimensions to pad, or ``None``. For a
            BCHW image the last two dimensions (H, W) are padded.
    """

    def __init__(self, data, stack=False, padding_value=0, cpu_only=False,
                 pad_dims=2):
        self._data = data
        self._cpu_only = cpu_only
        self._stack = stack
        self._padding_value = padding_value
        assert pad_dims in [None, 1, 2, 3]
        self._pad_dims = pad_dims

    def __repr__(self):
        return f'{self.__class__.__name__}({self.data!r})'

    def __len__(self):
        return len(self._data)

    @property
    def data(self):
        """The wrapped data itself."""
        return self._data

    @property
    def datatype(self):
        """Tensor type string for tensors, otherwise the Python type."""
        if isinstance(self.data, torch.Tensor):
            return self.data.type()
        else:
            return type(self.data)

    @property
    def cpu_only(self):
        """Whether the data stays on the CPU."""
        return self._cpu_only

    @property
    def stack(self):
        """Whether the data should be stacked into a batch."""
        return self._stack

    @property
    def padding_value(self):
        """Fill value used when padding."""
        return self._padding_value

    @property
    def pad_dims(self):
        """Number of trailing dimensions to pad."""
        return self._pad_dims

    def _require_tensor(self, attr):
        """Raise AttributeError unless the wrapped data is a tensor."""
        if not isinstance(self.data, torch.Tensor):
            raise AttributeError(
                f'{self.__class__.__name__}.{attr} is only available when '
                f'the wrapped data is a torch.Tensor, got {self.datatype}')

    def size(self, *args, **kwargs):
        """Forward to ``Tensor.size``; ``collate`` relies on this."""
        self._require_tensor('size')
        return self.data.size(*args, **kwargs)

    def dim(self):
        """Forward to ``Tensor.dim``; ``collate`` relies on this."""
        self._require_tensor('dim')
        return self.data.dim()
首先得到batch个Dataset.__getitem__的结果:[(img1(C, H, W), bboxes1), …],将其输入到collate函数中,转换为[img(batch, C, H, W), [bboxes1, bboxes2, …]]。
collate实现在mmcv中,源码如下:
from collections.abc import Mapping, Sequence
import torch
import torch.nn.functional as F
from torch.utils.data.dataloader import default_collate
def collate(batch, samples_per_gpu):
    """Collate a list of samples into per-GPU chunks.

    ``DataContainer`` items are handled in three ways: ``cpu_only`` data and
    non-stacked data are grouped into plain lists, while stacked tensors are
    padded to a common shape and stacked. Sequences and mappings are
    collated recursively; anything else goes to ``default_collate``.

    Args:
        batch: Sequence of ``Dataset.__getitem__`` results. Its length is
            the total number of samples in the batch. The original notes
            give that as ``num_gpus * samples_per_gpu``.
        samples_per_gpu (int): Number of samples on each GPU; the batch is
            split into consecutive chunks of this size.

    Returns:
        The collated batch, mirroring the structure of the samples.

    Raises:
        TypeError: If ``batch`` is not a sequence.
    """
    if not isinstance(batch, Sequence):
        # Objects without a ``dtype`` used to raise AttributeError here
        # instead of the intended TypeError.
        raise TypeError(
            f'{getattr(batch, "dtype", type(batch))} is not supported.')

    if isinstance(batch[0], DataContainer):
        stacked = []
        if batch[0].cpu_only:
            for i in range(0, len(batch), samples_per_gpu):
                stacked.append(
                    [sample.data for sample in batch[i:i + samples_per_gpu]])
            return DataContainer(
                stacked, batch[0].stack, batch[0].padding_value,
                cpu_only=True)
        elif batch[0].stack:
            for i in range(0, len(batch), samples_per_gpu):
                chunk = batch[i:i + samples_per_gpu]
                first = batch[i]
                assert isinstance(first.data, torch.Tensor)

                if first.pad_dims is None:
                    stacked.append(
                        default_collate([sample.data for sample in chunk]))
                    continue

                # Tensor methods are called on ``.data`` so this does not
                # depend on DataContainer forwarding ``dim``/``size``.
                ndim = first.data.dim()
                assert ndim > first.pad_dims
                max_shape = [0 for _ in range(first.pad_dims)]
                for sample in chunk:
                    # Dimensions that are not padded must already agree.
                    for dim in range(0, ndim - first.pad_dims):
                        assert first.data.size(dim) == sample.data.size(dim)
                    # Largest extent of each padded (trailing) dimension.
                    for dim in range(1, first.pad_dims + 1):
                        max_shape[dim - 1] = max(max_shape[dim - 1],
                                                 sample.data.size(-dim))
                padded_samples = []
                for sample in chunk:
                    # F.pad takes (before, after) pairs starting from the
                    # last dimension. Only the "after" slots are filled, so
                    # content stays anchored at the origin and boxes need no
                    # offset.
                    pad = [0 for _ in range(first.pad_dims * 2)]
                    for dim in range(1, first.pad_dims + 1):
                        pad[2 * dim - 1] = (
                            max_shape[dim - 1] - sample.data.size(-dim))
                    padded_samples.append(
                        F.pad(sample.data, pad, value=sample.padding_value))
                # default_collate stacks the equally shaped tensors along a
                # new leading batch dimension.
                stacked.append(default_collate(padded_samples))
        else:
            # Not stacked: keep the raw data as per-GPU lists.
            for i in range(0, len(batch), samples_per_gpu):
                stacked.append(
                    [sample.data for sample in batch[i:i + samples_per_gpu]])
        return DataContainer(stacked, batch[0].stack, batch[0].padding_value)
    elif (isinstance(batch[0], Sequence)
          and not isinstance(batch[0], (str, bytes))):
        # str/bytes are Sequences of themselves and would recurse forever;
        # they fall through to default_collate instead.
        transposed = zip(*batch)
        return [collate(samples, samples_per_gpu) for samples in transposed]
    elif isinstance(batch[0], Mapping):
        return {
            key: collate([d[key] for d in batch], samples_per_gpu)
            for key in batch[0]
        }
    else:
        return default_collate(batch)
collate对图像进行数据分发时,会把不同size的图像进行pad。这里的pad操作会把图片从左上角对齐,即填充右边和下边。
在collate中需要将不同size的图片padding到一样的大小,为了尽可能减少pad面积、节省算力,GroupSampler类应运而生。它将未处理的图片根据宽高比ratio分为两组:ratio大于1的分为一组,ratio小于1的分为另一组。把ratio相近的图片放在同一个batch中,能有效减少pad面积。
通过dataset得到数据集的index,每个图像对应一个index,sampler指的是采样方法,指的是每个batch采样哪些index,collate_fn是对采样的图像进行处理,最后得到batch数据用于训练测试。
class GroupSampler(Sampler):
    """Sampler that keeps every ``samples_per_gpu`` chunk in one group.

    Args:
        dataset: Dataset exposing a ``flag`` array with one group id per
            sample. Per the article the flag is set by the dataset at
            construction time from the image aspect ratio and takes two
            values.
        samples_per_gpu (int): Chunk size; each group is padded up to a
            multiple of it.
    """

    def __init__(self, dataset, samples_per_gpu=1):
        assert hasattr(dataset, 'flag')
        self.dataset = dataset
        self.samples_per_gpu = samples_per_gpu
        self.flag = dataset.flag.astype(np.int64)
        # Number of samples in each group.
        self.group_sizes = np.bincount(self.flag)
        # Returned by __len__. Each group is rounded up to a multiple of
        # samples_per_gpu, e.g. groups of 100 and 200 with samples_per_gpu=29
        # give 116 + 203 = 319.
        self.num_samples = 0
        for size in self.group_sizes:
            self.num_samples += int(np.ceil(
                size / self.samples_per_gpu)) * self.samples_per_gpu

    def __iter__(self):
        """Return an iterator over integer sample indices."""
        indices = []
        for i, size in enumerate(self.group_sizes):
            if size == 0:
                continue
            # Indices of the samples that belong to group i, shuffled.
            indice = np.where(self.flag == i)[0]
            assert len(indice) == size
            np.random.shuffle(indice)
            # Pad the group to a multiple of samples_per_gpu by re-drawing
            # indices from the same group.
            num_extra = int(np.ceil(size / self.samples_per_gpu)
                            ) * self.samples_per_gpu - len(indice)
            indice = np.concatenate(
                [indice, np.random.choice(indice, num_extra)])
            indices.append(indice)
        indices = np.concatenate(indices)
        # Shuffle whole chunks, so each chunk still comes from one group.
        indices = [
            indices[i * self.samples_per_gpu:(i + 1) * self.samples_per_gpu]
            for i in np.random.permutation(
                range(len(indices) // self.samples_per_gpu))
        ]
        indices = np.concatenate(indices)
        indices = indices.astype(np.int64).tolist()
        assert len(indices) == self.num_samples
        return iter(indices)

    def __len__(self):
        return self.num_samples
在本文中,介绍了mmdet框架下数据处理的流程,将数据增强、数据集构建和dataloader构建的顺序和关系理顺,并简单介绍了常见数据增强内容。