Object detection is an important research direction in computer vision; it deals with detecting instances of specific classes of visual objects in digital images. As one of the fundamental problems of computer vision, object detection is the basis and prerequisite for many other vision tasks such as image captioning, instance segmentation, and object tracking. In practice we often build datasets with our own scripts or with annotation tools, so the resulting formats vary widely. To make training easier, most object detection frameworks support a few common annotation formats out of the box, the most common being COCO, Pascal VOC, and YOLO. This article introduces these dataset formats together with the Python conversion scripts I wrote (they usually need some tweaking for your own data).
1. COCO
1.1 COCO Dataset Format
The COCO (Common Objects in Context) dataset is a large-scale dataset for object detection, image segmentation, and image captioning, and its annotation format is one of the most widely used. COCO2017 is currently the most commonly used release. The official website is COCO - Common Objects in Context (cocodataset.org).
A COCO dataset mainly consists of images (jpg, png, etc.) and annotation files (json). Its directory layout is as follows (a trailing / denotes a folder):
-coco/
  |-train2017/
    |-1.jpg
    |-2.jpg
  |-val2017/
    |-3.jpg
    |-4.jpg
  |-test2017/
    |-5.jpg
    |-6.jpg
  |-annotations/
    |-instances_train2017.json
    |-instances_val2017.json
    |-*.json
The train2017 and val2017 folders store the training and validation images respectively, while test2017 stores the test set, which may contain images only or images plus annotations and is generally used separately.
The files in the annotations folder are the annotation files. If you have xml annotations, they usually need to be converted to json. The json structure is as follows (see the official website for more details):
{
"info": info,
"images": [image], //列表
"annotations": [annotation], //列表
"categories": [category], //列表
"licenses": [license], //列表
}
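Since the annotation file is plain json, a quick way to inspect its top level is to load it with the standard library. A minimal sketch (the path below is an assumption; point it at your own file):

import json

with open("coco/annotations/instances_train2017.json") as f:  # assumed path
    data = json.load(f)
print(list(data.keys()))  # e.g. ['info', 'images', 'annotations', 'categories', 'licenses']
print(len(data["images"]), "images,", len(data["annotations"]), "annotations")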
The info field describes the dataset as a whole, including year, version, description, and so on. If you only want to train a model it is not really important. It looks like this:
// not that important for training
info{
"year": int,
"version": str,
"description": str,
"contributor": str,
"url": str,
"date_created": datetime,
}
Each image entry holds the basic information of one image, including its id, width, height, and file name. The image id must match the image_id referenced by the corresponding entries in annotations, as shown below:
image{
"id": int, //必要
"width": int, //必要
"height": int, //必要
"file_name": str, //必要
"license": int,
"flickr_url": str,
"coco_url": str,
"date_captured": datetime,
}
Each annotation entry is the most important part of the file; it contains the annotation id, the id of the image it belongs to, the category id, and so on, as shown below:
annotation{
"id": int, //标注id
"image_id": int, //所属图像id
"category_id": int, //类别id
"segmentation": RLE or [polygon], //图像分割标注
"area": float, //区域面积
"bbox": [x,y,width,height], //目标框左上角坐标以及宽高
"iscrowd": 0 or 1, //是否密集
}
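Note that bbox is the top-left corner plus width and height, not a pair of corners, and for detection-only data area is simply the box area. A minimal sketch of building these two fields from corner coordinates (the helper name is mine, for illustration):

def corners_to_coco_bbox(xmin, ymin, xmax, ymax):
    # COCO bbox is [top-left x, top-left y, width, height]; area here is the box area
    w, h = xmax - xmin, ymax - ymin
    return [xmin, ymin, w, h], w * h

bbox, area = corners_to_coco_bbox(246, 61, 374, 377)
# bbox -> [246, 61, 128, 316], area -> 40448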
Each category entry describes one class, including its supercategory, id, and name, as shown below:
category{
"id": int, //类别序号
"name": str, //类别名称
"supercategory": str, //父类别
}
Each license entry holds the licensing information of the dataset, including its id, license name, and URL, as shown below:
// not important for training
license{
"id": int,
"name": str,
"url": str,
}
Next, let's look at a simple example:
{
    "info": {...},
    "images": [
        {"id": 1, "file_name": "1.jpg", "height": 334, "width": 500},
        {"id": 2, "file_name": "2.jpg", "height": 445, "width": 556}
    ],
    "annotations": [
        {"id": 1, "area": 40448, "iscrowd": 0, "image_id": 1, "bbox": [246, 61, 128, 316], "category_id": 3, "segmentation": []},
        {"id": 2, "area": 40448, "iscrowd": 0, "image_id": 1, "bbox": [246, 61, 128, 316], "category_id": 2, "segmentation": []},
        {"id": 3, "area": 40448, "iscrowd": 0, "image_id": 2, "bbox": [246, 61, 128, 316], "category_id": 1, "segmentation": []}
    ],
    "categories": [
        {"supercategory": "none", "id": 1, "name": "liner"},
        {"supercategory": "none", "id": 2, "name": "containership"},
        {"supercategory": "none", "id": 3, "name": "bulkcarrier"}
    ],
    "licenses": [{...}]
}
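To sanity-check a generated annotation file, you can also load it with the pycocotools COCO API. A minimal sketch (it assumes pycocotools is installed and the annotation path below; adjust both to your setup):

from pycocotools.coco import COCO  # pip install pycocotools

coco = COCO("coco/annotations/instances_train2017.json")  # assumed path
print(len(coco.getImgIds()), "images,", len(coco.getAnnIds()), "annotations")
# inspect the annotations of the first image
first_id = coco.getImgIds()[0]
for ann in coco.loadAnns(coco.getAnnIds(imgIds=[first_id])):
    print(ann["category_id"], ann["bbox"])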
1.2 COCO Conversion Script
The Python conversion script is shown below; you need to prepare the images and the xml annotation files:
# -*- coding: utf-8 -*-
# @Author : justlovesmile
# @Date : 2021/9/8 15:36
import os, random, json
import shutil as sh
from tqdm.auto import tqdm
import xml.etree.ElementTree as xmlET

def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    else:
        print(f"The path ({path}) already exists.")
        return False

def readxml(file):
    tree = xmlET.parse(file)
    # image size fields
    size = tree.find('size')
    width = int(size.find('width').text)
    height = int(size.find('height').text)
    # object fields
    objs = tree.findall('object')
    bndbox = []
    for obj in objs:
        label = obj.find("name").text
        bnd = obj.find("bndbox")
        xmin = int(bnd.find("xmin").text)
        ymin = int(bnd.find("ymin").text)
        xmax = int(bnd.find("xmax").text)
        ymax = int(bnd.find("ymax").text)
        bbox = [xmin, ymin, xmax, ymax, label]
        bndbox.append(bbox)
    return [[width, height], bndbox]

def tococo(xml_root, image_root, output_root, classes={}, errorId=[], train_percent=0.9):
    # assert
    assert train_percent <= 1 and len(classes) > 0
    # define the root path
    train_root = os.path.join(output_root, "train2017")
    val_root = os.path.join(output_root, "val2017")
    ann_root = os.path.join(output_root, "annotations")
    # initialize train and val dict
    train_content = {
        "images": [],       # {"file_name": "09780.jpg", "height": 334, "width": 500, "id": 9780}
        "annotations": [],  # {"area": 40448, "iscrowd": 0, "image_id": 1, "bbox": [246, 61, 128, 316], "category_id": 5, "id": 1, "segmentation": []}
        "categories": []    # {"supercategory": "none", "id": 1, "name": "liner"}
    }
    val_content = {
        "images": [],
        "annotations": [],
        "categories": []
    }
    train_json = 'instances_train2017.json'
    val_json = 'instances_val2017.json'
    # divide the trainset and valset
    images = os.listdir(image_root)
    total_num = len(images)
    train_num = int(total_num * train_percent)
    train_file = sorted(random.sample(images, train_num))
    if mkdir(output_root):
        if mkdir(train_root) and mkdir(val_root) and mkdir(ann_root):
            idx1, idx2, dx1, dx2 = 0, 0, 0, 0
            for file in tqdm(images):
                name = os.path.splitext(os.path.basename(file))[0]
                if name not in errorId:
                    res = readxml(os.path.join(xml_root, name + '.xml'))
                    if file in train_file:
                        idx1 += 1
                        sh.copy(os.path.join(image_root, file), train_root)
                        train_content['images'].append(
                            {"file_name": file, "width": res[0][0], "height": res[0][1], "id": idx1})
                        for b in res[1]:
                            dx1 += 1
                            x = b[0]
                            y = b[1]
                            w = b[2] - b[0]
                            h = b[3] - b[1]
                            train_content['annotations'].append(
                                {"area": w * h, "iscrowd": 0, "image_id": idx1, "bbox": [x, y, w, h],
                                 "category_id": classes[b[4]], "id": dx1, "segmentation": []})
                    else:
                        idx2 += 1
                        sh.copy(os.path.join(image_root, file), val_root)
                        val_content['images'].append(
                            {"file_name": file, "width": res[0][0], "height": res[0][1], "id": idx2})
                        for b in res[1]:
                            dx2 += 1
                            x = b[0]
                            y = b[1]
                            w = b[2] - b[0]
                            h = b[3] - b[1]
                            val_content['annotations'].append(
                                {"area": w * h, "iscrowd": 0, "image_id": idx2, "bbox": [x, y, w, h],
                                 "category_id": classes[b[4]], "id": dx2, "segmentation": []})
            for i, j in classes.items():
                train_content['categories'].append({"supercategory": "none", "id": j, "name": i})
                val_content['categories'].append({"supercategory": "none", "id": j, "name": i})
            with open(os.path.join(ann_root, train_json), 'w') as f:
                json.dump(train_content, f)
            with open(os.path.join(ann_root, val_json), 'w') as f:
                json.dump(val_content, f)
            print("Number of Train Images:", len(os.listdir(train_root)))
            print("Number of Val Images:", len(os.listdir(val_root)))

def test():
    box_root = "E:/MyProject/Dataset/hwtest/annotations"  # folder of xml annotations
    image_root = "E:/MyProject/Dataset/hwtest/images"     # folder of images
    output_root = "E:/MyProject/Dataset/coco"             # output folder
    classes = {"liner": 0, "bulk carrier": 1, "warship": 2, "sailboat": 3, "canoe": 4,
               "container ship": 5, "fishing boat": 6}    # class-name-to-id mapping
    errorId = []          # ids of bad samples to skip
    train_percent = 0.9   # train/val split ratio
    tococo(box_root, image_root, output_root, classes=classes, errorId=errorId, train_percent=train_percent)

if __name__ == "__main__":
    test()
2. VOC
2.1 VOC Dataset Format
The VOC (Visual Object Classes) dataset comes from the PASCAL VOC challenge, whose main tasks are Object Classification, Object Detection, Object Segmentation, Human Layout, and Action Classification. The official website is The PASCAL Visual Object Classes Homepage (ox.ac.uk). The main releases are VOC2007 and VOC2012.
A VOC dataset mainly consists of images (jpg, png, etc.) and annotation files (xml). Its directory layout is as follows (a trailing / denotes a folder):
-VOC/
  |-JPEGImages/
    |-1.jpg
    |-2.jpg
  |-Annotations/
    |-1.xml
    |-2.xml
  |-ImageSets/
    |-Layout/
      |-*.txt
    |-Main/
      |-train.txt
      |-val.txt
      |-trainval.txt
      |-test.txt
    |-Segmentation/
      |-*.txt
    |-Action/
      |-*.txt
  |-SegmentationClass/
  |-SegmentationObject/
For object detection, the most commonly used and required folders are JPEGImages, Annotations, and ImageSets/Main.
JPEGImages stores the images, while Annotations stores the xml annotation files, whose content looks like this:
<annotation>
    <folder>VOC</folder>              # folder the image is in
    <filename>000032.jpg</filename>   # image file name
    <source>                          # image source
        <database>The VOC Database</database>
        <annotation>PASCAL VOC</annotation>
        <image>flickr</image>
    </source>
    <size>                            # image size information
        <width>500</width>            # image width
        <height>281</height>          # image height
        <depth>3</depth>              # number of channels
    </size>
    <segmented>0</segmented>          # whether the image is used for segmentation; 0 means no, irrelevant for detection
    <object>                          # information of one object
        <name>aeroplane</name>        # class name of the object
        <pose>Frontal</pose>          # shooting angle; usually Unspecified if unknown
        <truncated>0</truncated>      # whether the object is truncated; 0 means complete
        <difficult>0</difficult>      # whether the object is hard to recognize; 0 means not difficult
        <bndbox>                      # bounding box information
            <xmin>104</xmin>          # top-left x
            <ymin>78</ymin>           # top-left y
            <xmax>375</xmax>          # bottom-right x
            <ymax>183</ymax>          # bottom-right y
        </bndbox>
    </object>
    # information of the other objects, omitted here
    <object>
        other object info, omitted here
    </object>
</annotation>
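The txt files under ImageSets/Main simply list image ids (file names without extension), one per line; that is also what the conversion script below writes. A minimal sketch of resolving such a split file to image and annotation paths (assuming the layout above and .jpg images; the helper name is mine):

import os

def load_split(voc_root, split="train"):
    # read image ids from ImageSets/Main/<split>.txt and map them to file paths
    txt = os.path.join(voc_root, "ImageSets", "Main", f"{split}.txt")
    with open(txt) as f:
        ids = [line.strip() for line in f if line.strip()]
    return [(os.path.join(voc_root, "JPEGImages", i + ".jpg"),
             os.path.join(voc_root, "Annotations", i + ".xml")) for i in ids]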
2.2 VOC Conversion Script
The script below only covers the case where you already have images and xml files; I may write a COCO-to-VOC converter later if needed:
# -*- coding: utf-8 -*-
# @Author : justlovesmile
# @Date : 2021/9/8 21:01
import os, random
from tqdm.auto import tqdm
import shutil as sh

def mkdir(path):
    if not os.path.exists(path):
        os.mkdir(path)
        return True
    else:
        print(f"The path ({path}) already exists.")
        return False

def tovoc(xmlroot, imgroot, saveroot, errorId=[], classes={}, tvp=1.0, trp=0.9):
    '''
    Args:
        xmlroot/imgroot: folders of the xml annotations and images
        saveroot: root directory where the VOC-format dataset is saved
    Function:
        Load the data and save it in VOC format.
    Output layout:
        VOC/
            Annotations/
                - **.xml
            JPEGImages/
                - **.jpg
            ImageSets/
                Main/
                    - train.txt
                    - test.txt
                    - val.txt
                    - trainval.txt
    '''
    # assert
    assert len(classes) > 0
    # init path
    VOC = saveroot
    ann_path = os.path.join(VOC, 'Annotations')
    img_path = os.path.join(VOC, 'JPEGImages')
    set_path = os.path.join(VOC, 'ImageSets')
    txt_path = os.path.join(set_path, 'Main')
    # mkdirs
    if mkdir(VOC):
        if mkdir(ann_path) and mkdir(img_path) and mkdir(set_path):
            mkdir(txt_path)
    images = os.listdir(imgroot)
    list_index = range(len(images))
    # test and trainval set
    trainval_percent = tvp
    train_percent = trp
    val_percent = 1 - train_percent if train_percent < 1 else 0.1
    total_num = len(images)
    trainval_num = int(total_num * trainval_percent)
    train_num = int(trainval_num * train_percent)
    val_num = int(trainval_num * val_percent)
    trainval = random.sample(list_index, trainval_num)
    train = random.sample(trainval, train_num)  # sample train from trainval so it is a subset
    val = random.sample(trainval, val_num)      # used only when train_percent == 1
    for i in tqdm(list_index):
        imgfile = images[i]
        img_id = os.path.splitext(os.path.basename(imgfile))[0]
        xmlfile = img_id + ".xml"
        sh.copy(os.path.join(imgroot, imgfile), os.path.join(img_path, imgfile))
        sh.copy(os.path.join(xmlroot, xmlfile), os.path.join(ann_path, xmlfile))
        if img_id not in errorId:
            if i in trainval:
                with open(os.path.join(txt_path, 'trainval.txt'), 'a') as f:
                    f.write(img_id + '\n')
                if i in train:
                    with open(os.path.join(txt_path, 'train.txt'), 'a') as f:
                        f.write(img_id + '\n')
                else:
                    with open(os.path.join(txt_path, 'val.txt'), 'a') as f:
                        f.write(img_id + '\n')
                if train_percent == 1 and i in val:
                    with open(os.path.join(txt_path, 'val.txt'), 'a') as f:
                        f.write(img_id + '\n')
            else:
                with open(os.path.join(txt_path, 'test.txt'), 'a') as f:
                    f.write(img_id + '\n')
    # end
    print("Dataset to VOC format finished!")

def test():
    box_root = "E:/MyProject/Dataset/hwtest/annotations"
    image_root = "E:/MyProject/Dataset/hwtest/images"
    output_root = "E:/MyProject/Dataset/voc"
    classes = {"liner": 0, "bulk carrier": 1, "warship": 2, "sailboat": 3, "canoe": 4,
               "container ship": 5, "fishing boat": 6}
    errorId = []
    train_percent = 0.9
    tovoc(box_root, image_root, output_root, errorId, classes, trp=train_percent)

if __name__ == "__main__":
    test()
3. YOLO
3.1 YOLO Dataset Format
The YOLO dataset format exists mainly for training YOLO models. There is no fixed requirement on the directory layout, because data loading can be configured through the model's configuration file. The only thing to note is that YOLO labels store each box's position normalized by the image width and height, one object per line, as follows:
{class id} {normalized box center x} {normalized box center y} {normalized box width w} {normalized box height h}
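For example, taking the box from the VOC xml above (xmin=104, ymin=78, xmax=375, ymax=183 in a 500x281 image), the conversion is just the box center and size divided by the image size. A minimal sketch (the function name is mine, for illustration):

def voc_box_to_yolo_line(cls_id, xmin, ymin, xmax, ymax, img_w, img_h):
    # YOLO label line: class x_center y_center width height, all normalized to [0, 1]
    xc = (xmin + xmax) / 2.0 / img_w
    yc = (ymin + ymax) / 2.0 / img_h
    w = (xmax - xmin) / img_w
    h = (ymax - ymin) / img_h
    return f"{cls_id} {xc:.6f} {yc:.6f} {w:.6f} {h:.6f}"

print(voc_box_to_yolo_line(0, 104, 78, 375, 183, 500, 281))
# -> 0 0.479000 0.464413 0.542000 0.373665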
3.2 YOLO Conversion Script
The Python conversion script is shown below:
# -*- coding: utf-8 -*-
# @Author : justlovesmile
# @Date : 2021/9/8 20:28
import os
import random
from tqdm.auto import tqdm
import shutil as sh
try:
    import xml.etree.cElementTree as et
except ImportError:
    import xml.etree.ElementTree as et

def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    else:
        print(f"The path ({path}) already exists.")
        return False

def xml2yolo(xmlpath, savepath, classes={}):
    namemap = classes
    # try:
    #     with open('classes_yolo.json', 'r') as f:
    #         namemap = json.load(f)
    # except:
    #     pass
    rt = et.parse(xmlpath).getroot()
    w = int(rt.find("size").find("width").text)
    h = int(rt.find("size").find("height").text)
    with open(savepath, "w") as f:
        for obj in rt.findall("object"):
            name = obj.find("name").text
            xmin = int(obj.find("bndbox").find("xmin").text)
            ymin = int(obj.find("bndbox").find("ymin").text)
            xmax = int(obj.find("bndbox").find("xmax").text)
            ymax = int(obj.find("bndbox").find("ymax").text)
            f.write(
                f"{namemap[name]} {(xmin + xmax)/w/2.} {(ymin + ymax)/h/2.} {(xmax - xmin)/w} {(ymax - ymin)/h}"
                "\n"
            )

def trainval(xmlroot, imgroot, saveroot, errorId=[], classes={}, tvp=1.0, trp=0.9):
    # assert
    assert tvp <= 1.0 and trp <= 1.0 and len(classes) > 0
    # create dirs
    imglabel = ['images', 'labels']
    trainvaltest = ['train', 'val', 'test']
    mkdir(saveroot)
    for r in imglabel:
        mkdir(os.path.join(saveroot, r))
        for s in trainvaltest:
            mkdir(os.path.join(saveroot, r, s))
    # train / val
    trainval_percent = tvp
    train_percent = trp
    val_percent = 1 - train_percent if train_percent < 1.0 else 0.15
    total_img = os.listdir(imgroot)
    num = len(total_img)
    list_index = range(num)
    tv = int(num * trainval_percent)
    tr = int(tv * train_percent)
    va = int(tv * val_percent)
    trainval = random.sample(list_index, tv)  # trainset and valset
    train = random.sample(trainval, tr)       # trainset
    val = random.sample(trainval, va)         # valset, used only when train_percent == 1
    print(f"trainval_percent:{trainval_percent},train_percent:{train_percent},val_percent:{val_percent}")
    for i in tqdm(list_index):
        name = total_img[i]
        op = os.path.join(imgroot, name)
        file_id = os.path.splitext(os.path.basename(name))[0]
        if file_id not in errorId:
            xmlp = os.path.join(xmlroot, file_id + '.xml')
            if i in trainval:
                # trainset and valset
                if i in train:
                    sp = os.path.join(saveroot, "images", "train", name)
                    xml2yolo(xmlp, os.path.join(saveroot, "labels", "train", file_id + '.txt'), classes)
                    sh.copy(op, sp)
                else:
                    sp = os.path.join(saveroot, "images", "val", name)
                    xml2yolo(xmlp, os.path.join(saveroot, "labels", "val", file_id + '.txt'), classes)
                    sh.copy(op, sp)
                if train_percent == 1.0 and i in val:
                    sp = os.path.join(saveroot, "images", "val", name)
                    xml2yolo(xmlp, os.path.join(saveroot, "labels", "val", file_id + '.txt'), classes)
                    sh.copy(op, sp)
            else:
                # testset
                sp = os.path.join(saveroot, "images", "test", name)
                xml2yolo(xmlp, os.path.join(saveroot, "labels", "test", file_id + '.txt'), classes)
                sh.copy(op, sp)

def maketxt(dir, saveroot, filename):
    savetxt = os.path.join(saveroot, filename)
    with open(savetxt, 'w') as f:
        for i in tqdm(os.listdir(dir)):
            f.write(os.path.join(dir, i) + '\n')

def toyolo(xmlroot, imgroot, saveroot, errorId=[], classes={}, tvp=1, train_percent=0.9):
    # toyolo main function
    trainval(xmlroot, imgroot, saveroot, errorId, classes, tvp, train_percent)
    maketxt(os.path.join(saveroot, "images", "train"), saveroot, "train.txt")
    maketxt(os.path.join(saveroot, "images", "val"), saveroot, "val.txt")
    maketxt(os.path.join(saveroot, "images", "test"), saveroot, "test.txt")
    print("Dataset to yolo format success.")

def test():
    box_root = "E:/MyProject/Dataset/hwtest/annotations"
    image_root = "E:/MyProject/Dataset/hwtest/images"
    output_root = "E:/MyProject/Dataset/yolo"
    classes = {"liner": 0, "bulk carrier": 1, "warship": 2, "sailboat": 3, "canoe": 4,
               "container ship": 5, "fishing boat": 6}
    errorId = []
    train_percent = 0.9
    toyolo(box_root, image_root, output_root, errorId, classes, train_percent=train_percent)

if __name__ == "__main__":
    test()
Running this script produces the following layout in the output folder:
-yolo/
  |-images/
    |-train/
      |-1.jpg
      |-2.jpg
    |-test/
      |-3.jpg
      |-4.jpg
    |-val/
      |-5.jpg
      |-6.jpg
  |-labels/
    |-train/
      |-1.txt
      |-2.txt
    |-test/
      |-3.txt
      |-4.txt
    |-val/
      |-5.txt
      |-6.txt
  |-train.txt
  |-test.txt
  |-val.txt
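How these files are consumed then depends on the training framework (a YOLOv5-style data config, for instance, usually points at the train.txt/val.txt files and the class names). As a quick sanity check after conversion, the sketch below (written against the layout produced by the script above; the root path is the one used in test()) verifies that every image listed in a split txt has a matching label file:

import os

def check_split(saveroot, split="train"):
    # every image path listed in <split>.txt should have a label file under labels/<split>/
    missing = []
    with open(os.path.join(saveroot, f"{split}.txt")) as f:
        for line in f:
            img = line.strip()
            if not img:
                continue
            stem = os.path.splitext(os.path.basename(img))[0]
            label = os.path.join(saveroot, "labels", split, stem + ".txt")
            if not os.path.exists(label):
                missing.append(img)
    print(f"{split}: {len(missing)} images without labels")
    return missing

check_split("E:/MyProject/Dataset/yolo", "train")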