目标检测 |常用数据集标注格式及生成脚本

目标检测是计算机视觉任务中的一个重要研究方向，其用于解决对数码图像中特定种类的可视目标实例的检测问题。目标检测作为计算机视觉的根本性问题之一，是其他诸多计算机视觉任务，例如图像描述生成，实例分割和目标跟踪的基础以及前提。而在解决此类问题时，我们常常需要使用自己的脚本或者利用标注工具生成数据集，数据集格式往往会多种多样，因此对于目标检测任务而言，为了更好地兼容训练，大多数目标检测模型框架会默认支持几种常用的数据集标注格式，常见的分别是COCO，Pascal VOC，YOLO等等。本文主要介绍上述几种数据集格式以及我写的Python脚本（一般需要根据实际情况再改改）。

1. COCO

1.1 COCO数据集格式

COCO（Common Objects in COntext）数据集，是一个大规模的，适用于目标检测，图像分割，Image Captioning任务的数据集，其标注格式是最常用的几种格式之一。目前使用较多的是COCO2017数据集。其官网为COCO - Common Objects in Context (cocodataset.org)。

COCO数据集主要包含图像（jpg或者png等等）和标注文件（json），其数据集格式如下（`/`代表文件夹）：

代码语言：javascript复制

-coco/
    |-train2017/
    	|-1.jpg
    	|-2.jpg
    |-val2017/
    	|-3.jpg
    	|-4.jpg
    |-test2017/
    	|-5.jpg
    	|-6.jpg
    |-annotations/
    	|-instances_train2017.json
    	|-instances_val2017.json
    	|-*.json

`train2017`以及`val2017`这两个文件夹中存储的是训练集和验证集的图像，而`test2017`文件夹中存储的是测试集的信息，可以只是图像，也可以包含标注，一般是单独使用的。

`annotations`文件夹中的文件就是标注文件，如果你有`xml`文件，通常需要转换成`json`格式，其格式如下（更详细的可以参考官网）：

代码语言：javascript复制

{
	"info": info, 
	"images": [image], //列表
	"annotations": [annotation], //列表
	"categories": [category], //列表
	"licenses": [license], //列表
}

其中`info`为整个数据集的信息，包括年份，版本，描述等等信息，如果只是完成训练任务，其实不太重要，如下所示：

代码语言：javascript复制

//对于训练，不是那么的重要
info{
	"year": int, 
	"version": str, 
	"description": str, 
	"contributor": str, 
	"url": str, 
	"date_created": datetime,
}

其中的`image`为图像的基本信息，包括序号，宽高，文件名等等信息，其中的序号（`id`）需要和后面的`annotations`中的标注所属图片序号对应，如下所示：

代码语言：javascript复制

image{
	"id": int, //必要
	"width": int, //必要
	"height": int, //必要
	"file_name": str, //必要
	"license": int,
	"flickr_url": str,
	"coco_url": str,
	"date_captured": datetime, 
}

其中的`annotation`是最重要的标注信息，包括序号，所属图像序号，类别序号等等信息，如下所示：

代码语言：javascript复制

annotation{
	"id": int, //标注id
	"image_id": int, //所属图像id
	"category_id": int, //类别id
	"segmentation": RLE or [polygon], //图像分割标注
	"area": float, //区域面积
	"bbox": [x,y,width,height], //目标框左上角坐标以及宽高
	"iscrowd": 0 or 1, //是否密集
}

其中的`category`代表类别信息，包括父类别，类别序号以及类别名称，如下所示：

代码语言：javascript复制

category{
	"id": int, //类别序号
	"name": str, //类别名称
	"supercategory": str, //父类别
}

其中的`license`代表数据集的协议许可信息，包括序号，协议名称以及链接信息，如下所示：

代码语言：javascript复制

//对于训练，不重要
license{
	"id": int, 
	"name": str, 
	"url": str,
}

接下来，我们来看一个简单的示例：

代码语言：javascript复制

{
"info": {略}, "images": [{"id": 1, "file_name": "1.jpg", "height": 334, "width": 500}, {"id": 2, "file_name": "2.jpg", "height": 445, "width": 556}], "annotations": [{"id": 1, "area": 40448, "iscrowd": 0, "image_id": 1, "bbox": [246, 61, 128, 316], "category_id": 3, "segmentation": []}, {"id": 2, "area": 40448, "iscrowd": 0, "image_id": 1, "bbox": [246, 61, 128, 316], "category_id": 2, "segmentation": []}, {"id": 3, "area": 40448, "iscrowd": 0, "image_id": 2, "bbox": [246, 61, 128, 316], "category_id": 1, "segmentation": []}], "categories": [{"supercategory": "none", "id": 1, "name": "liner"},{"supercategory": "none", "id": 2, "name": "containership"},{"supercategory": "none", "id": 3, "name": "bulkcarrier"}], "licenses": [{略}]
}

1.2 COCO转换脚本

Python转换脚本如下所示，需要准备`图像`和`xml`标注文件：

代码语言：javascript复制

# -*- coding: utf-8 -*-
# @Author    : justlovesmile
# @Date      : 2021/9/8 15:36
import os, random, json
import shutil as sh
from tqdm.auto import tqdm
import xml.etree.ElementTree as xmlET

def mkdir(path):
    """Create the directory ``path`` (including missing parents) if it is absent.

    Returns:
        True when the directory was created by this call; False when something
        already exists at ``path`` (a notice is printed in that case).
    """
    if os.path.exists(path):
        print(f"The path ({path}) already exists.")
        return False
    os.makedirs(path)
    return True

def readxml(file):
    """Read the image size and every labelled box from one Pascal-VOC style XML file.

    Args:
        file: Path or file object accepted by ``ElementTree.parse``.

    Returns:
        ``[[width, height], boxes]`` where ``boxes`` holds one
        ``[xmin, ymin, xmax, ymax, label]`` list per ``<object>`` element.
    """
    doc = xmlET.parse(file)

    def _int_field(node, tag):
        # Numeric values are stored as the text of a child element.
        return int(node.find(tag).text)

    # Image size fields.
    size_node = doc.find('size')
    dims = [_int_field(size_node, 'width'), _int_field(size_node, 'height')]

    # Object fields.
    boxes = []
    for item in doc.findall('object'):
        label = item.find("name").text
        frame = item.find("bndbox")
        corners = [_int_field(frame, tag) for tag in ("xmin", "ymin", "xmax", "ymax")]
        boxes.append(corners + [label])
    return [dims, boxes]

def tococo(xml_root, image_root, output_root,classes={},errorId=[],train_percent=0.9):
    """Split an image folder into train/val sets and write them out in COCO layout.

    Every entry of ``image_root`` is paired with ``<stem>.xml`` in ``xml_root``
    (parsed with ``readxml``). Images are copied into ``train2017`` / ``val2017``
    under ``output_root`` and the boxes are written to
    ``annotations/instances_train2017.json`` and
    ``annotations/instances_val2017.json``. Image ids and annotation ids start
    at 1 and are counted separately for each split.

    Args:
        xml_root: Folder holding the XML annotation files.
        image_root: Folder holding the images; every entry is treated as an image.
        output_root: Destination folder. If it (or one of its sub-folders)
            already exists, nothing is converted.
        classes: Mapping of class name -> category id. Must not be empty.
        errorId: File stems (names without extension) to leave out.
        train_percent: Fraction of the listed images assigned to the training set.

    Raises:
        AssertionError: If ``train_percent`` > 1 or ``classes`` is empty.
        KeyError: If an XML file uses a label that is missing from ``classes``.
    """
    assert train_percent<=1 and len(classes)>0
    # Output locations per split.
    roots = {
        "train": os.path.join(output_root, "train2017"),
        "val": os.path.join(output_root, "val2017"),
    }
    ann_root = os.path.join(output_root, "annotations")
    json_names = {"train": 'instances_train2017.json', "val": 'instances_val2017.json'}

    # Both annotation files carry the same category table.
    categories = [{"supercategory": "none", "id": j, "name": i} for i, j in classes.items()]
    contents = {
        split: {"images": [], "annotations": [], "categories": list(categories)}
        for split in roots
    }

    # Choose the training images; a set keeps the per-image lookup O(1).
    images = os.listdir(image_root)
    train_num = int(len(images) * train_percent)
    train_files = set(random.sample(images, train_num))

    # Never write into an existing dataset. Returning here also avoids listing
    # split folders that were never created.
    if not (mkdir(output_root) and mkdir(roots["train"]) and mkdir(roots["val"]) and mkdir(ann_root)):
        return

    image_ids = {"train": 0, "val": 0}
    ann_ids = {"train": 0, "val": 0}
    for file in tqdm(images):
        name = os.path.splitext(os.path.basename(file))[0]
        if name in errorId:
            continue
        (width, height), boxes = readxml(os.path.join(xml_root, name + '.xml'))
        split = "train" if file in train_files else "val"
        image_ids[split] += 1
        sh.copy(os.path.join(image_root, file), roots[split])
        contents[split]['images'].append(
            {"file_name": file, "width": width, "height": height, "id": image_ids[split]})
        for xmin, ymin, xmax, ymax, label in boxes:
            ann_ids[split] += 1
            # COCO boxes are [x, y, width, height] measured from the top-left corner.
            w = xmax - xmin
            h = ymax - ymin
            contents[split]['annotations'].append(
                {"area": w * h, "iscrowd": 0, "image_id": image_ids[split], "bbox": [xmin, ymin, w, h],
                 "category_id": classes[label], "id": ann_ids[split], "segmentation": []})

    for split, content in contents.items():
        with open(os.path.join(ann_root, json_names[split]), 'w') as f:
            json.dump(content, f)
    print("Number of Train Images:", len(os.listdir(roots["train"])))
    print("Number of Val Images:", len(os.listdir(roots["val"])))
    
    
def test():
    """Run the COCO conversion on the author's sample dataset (hard-coded paths)."""
    tococo(
        "E:/MyProject/Dataset/hwtest/annotations",  # folder with the xml files
        "E:/MyProject/Dataset/hwtest/images",  # folder with the images
        "E:/MyProject/Dataset/coco",  # output folder
        # class name -> category id
        classes={"liner": 0,"bulk carrier": 1,"warship": 2,"sailboat": 3,"canoe": 4,"container ship": 5,"fishing boat": 6},
        errorId=[],  # ids of dirty samples to skip
        train_percent=0.9,  # share of images that goes to the training set
    )

# Run the sample conversion only when this file is executed as a script.
if __name__ == "__main__":
    test()

复制

2. VOC

2.1 VOC数据集格式

VOC（Visual Object Classes）数据集来源于PASCAL VOC挑战赛，其主要任务有`Object Classification`、`Object Detection`、`Object Segmentation`、`Human Layout`、`Action Classification`。其官网为The PASCAL Visual Object Classes Homepage (ox.ac.uk)。其主要数据集有VOC2007以及VOC2012。

VOC数据集主要包含图像（jpg或者png等等）和标注文件（xml），其数据集格式如下（`/`代表文件夹）：

代码语言：javascript复制

-VOC/
	|-JPEGImages/
		|-1.jpg
		|-2.jpg
	|-Annotations/
		|-1.xml
		|-2.xml
	|-ImageSets/
		|-Layout/
			|-*.txt
		|-Main/
			|-train.txt
			|-val.txt
			|-trainval.txt
			|-test.txt
		|-Segmentation/
			|-*.txt
		|-Action/
			|-*.txt
	|-SegmentationClass/
	|-SegmentationObject/

其中对于目标检测任务而言，最常用的以及必须的文件夹包括：`JPEGImages`，`Annotations`，`ImageSets/Main`。

`JPEGImages`里存放的是图像，而`Annotations`里存放的是`xml`标注文件，文件内容如下：

代码语言：javascript复制

<annotation>
	<folder>VOC</folder>            # 图像所在文件夹
	<filename>000032.jpg</filename> # 图像文件名
	<source>                        # 图像源
		<database>The VOC Database</database>
		<annotation>PASCAL VOC</annotation>
		<image>flickr</image>
	</source>
	<size>                          # 图像尺寸信息
		<width>500</width>    # 图像宽度
		<height>281</height>  # 图像高度
		<depth>3</depth>      # 图像通道数
	</size>
	<segmented>0</segmented>  # 图像是否用于分割，0代表不适用，对目标检测而言没关系
	<object>                  # 一个目标对象的信息
		<name>aeroplane</name>    # 目标的类别名
		<pose>Frontal</pose>      # 拍摄角度，若无一般为Unspecified
		<truncated>0</truncated>  # 是否被截断，0表示完整未截断
		<difficult>0</difficult>  # 是否难以识别，0表示不难识别
		<bndbox>            # 边界框信息
			<xmin>104</xmin>  # 左上角x
			<ymin>78</ymin>   # 左上角y
			<xmax>375</xmax>  # 右下角x
			<ymax>183</ymax>  # 右下角y
		</bndbox>
	</object>
    # 下面是其他目标的信息，这里略掉
	<object>
        其他object信息，这里省略
	</object>
</annotation>

2.2 VOC转换脚本

下面这个脚本，只适用于有图像和xml文件的情况下，coco转voc格式以后有需要再写：

代码语言：javascript复制

# -*- coding: utf-8 -*-
# @Author    : justlovesmile
# @Date      : 2021/9/8 21:01
import os,random
from tqdm.auto import tqdm
import shutil as sh

def mkdir(path):
    """Create the directory ``path`` if it does not exist yet.

    Missing parent folders are created as well, matching the ``mkdir`` helper
    of the COCO and YOLO scripts, so an output root whose parent is absent no
    longer raises ``FileNotFoundError``.

    Returns:
        True when the directory was created by this call; False when something
        already exists at ``path`` (a notice is printed in that case).
    """
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    else:
        print(f"The path ({path}) already exists.")
        return False

def tovoc(xmlroot,imgroot,saveroot,errorId=[],classes={},tvp=1.0,trp=0.9):
    """Copy images and XML annotations into a Pascal-VOC layout and write split lists.

    Resulting layout::

        saveroot/
          Annotations/     *.xml
          JPEGImages/      image files
          ImageSets/Main/  trainval.txt, train.txt, val.txt, test.txt

    Every entry of ``imgroot`` and its ``<stem>.xml`` from ``xmlroot`` is copied.
    Stems listed in ``errorId`` are copied too, but left out of the split lists.

    Args:
        xmlroot: Folder holding the XML annotation files.
        imgroot: Folder holding the images; every entry is treated as an image.
        saveroot: Root folder of the VOC dataset to create.
        errorId: File stems (names without extension) to keep out of the lists.
        classes: Class-name mapping; only checked for being non-empty.
        tvp: Fraction of all images used for trainval; the rest is listed in
            ``test.txt``.
        trp: Fraction of trainval used for training. The remaining trainval
            images form the validation list. With ``trp == 1`` every trainval
            image is a training image and 10% of them are also listed in
            ``val.txt``.

    Raises:
        AssertionError: If ``classes`` is empty.
    """
    assert len(classes)>0
    # Output folders.
    ann_path = os.path.join(saveroot, 'Annotations')
    img_path = os.path.join(saveroot,'JPEGImages')
    set_path = os.path.join(saveroot,'ImageSets')
    txt_path = os.path.join(set_path,'Main')
    # Create each folder on its own, parents first, so a pre-existing root
    # does not prevent the sub-folders from being created.
    for folder in (saveroot, ann_path, img_path, set_path, txt_path):
        mkdir(folder)

    images = os.listdir(imgroot)
    list_index = range(len(images))
    # Split sizes.
    val_percent = 1 - trp if trp<1 else 0.1
    trainval_num = int(len(images)*tvp)
    train_num = int(trainval_num*trp)
    val_num = int(trainval_num*val_percent)

    # train and the extra validation sample must both come from trainval;
    # sampling them from all images would push some training images into test.
    trainval_list = random.sample(list_index,trainval_num)
    trainval = set(trainval_list)
    train = set(random.sample(trainval_list,train_num))
    # Only needed when no image is left over for validation.
    extra_val = set(random.sample(trainval_list,val_num)) if trp==1 else set()

    split_ids = {'trainval': [], 'train': [], 'val': [], 'test': []}
    for i in tqdm(list_index):
        imgfile = images[i]
        img_id = os.path.splitext(os.path.basename(imgfile))[0]
        xmlfile = img_id + ".xml"
        sh.copy(os.path.join(imgroot,imgfile),os.path.join(img_path,imgfile))
        sh.copy(os.path.join(xmlroot,xmlfile),os.path.join(ann_path,xmlfile))
        if img_id in errorId:
            continue
        if i not in trainval:
            split_ids['test'].append(img_id)
            continue
        split_ids['trainval'].append(img_id)
        if i in train:
            split_ids['train'].append(img_id)
        else:
            split_ids['val'].append(img_id)
        if i in extra_val:
            split_ids['val'].append(img_id)

    # Write each list once in 'w' mode so a re-run replaces the lists instead of
    # appending duplicates. All four files are written, even when empty.
    for split, ids in split_ids.items():
        with open(os.path.join(txt_path,split + '.txt'),'w') as f:
            f.writelines(img_id + '\n' for img_id in ids)

    # end
    print("Dataset to VOC format finished!")

def test():
    """Run the VOC conversion on the author's sample dataset (hard-coded paths)."""
    tovoc(
        "E:/MyProject/Dataset/hwtest/annotations",  # folder with the xml files
        "E:/MyProject/Dataset/hwtest/images",  # folder with the images
        "E:/MyProject/Dataset/voc",  # output folder
        errorId=[],
        classes={"liner": 0,"bulk carrier": 1,"warship": 2,"sailboat": 3,"canoe": 4,"container ship": 5,"fishing boat": 6},
        trp=0.9,  # share of trainval images used for training
    )

# Run the sample conversion only when this file is executed as a script.
if __name__ == "__main__":
    test()

3. YOLO

3.1 YOLO数据集格式

YOLO数据集格式的出现主要是为了训练`YOLO`模型，其文件格式没有固定的要求，因为可以通过修改模型的配置文件进行数据加载，唯一需要注意的是`YOLO`数据集的标注格式是将目标框的位置信息进行归一化处理（此处归一化指的是除以图片宽和高），如下所示：

代码语言：javascript复制

{目标类别} {归一化后的目标中心点x坐标} {归一化后的目标中心点y坐标} {归一化后的目标框宽度w} {归一化后的目标框高度h}

3.2 YOLO转换脚本

Python转换脚本如下所示：

代码语言：javascript复制

# -*- coding: utf-8 -*-
# @Author    : justlovesmile
# @Date      : 2021/9/8 20:28
import os
import random
from tqdm.auto import tqdm
import shutil as sh
try:
    import xml.etree.cElementTree as et
except ImportError:
    import xml.etree.ElementTree as et

def mkdir(path):
    """Make sure the directory ``path`` exists, creating parents when needed.

    Returns:
        True if this call created the directory, False if ``path`` was already
        present (in which case a notice is printed and nothing is changed).
    """
    already_there = os.path.exists(path)
    if already_there:
        print(f"The path ({path}) already exists.")
    else:
        os.makedirs(path)
    return not already_there

def xml2yolo(xmlpath,savepath,classes={}):
    """Convert one Pascal-VOC style XML annotation into a YOLO label file.

    One line is written per ``<object>``::

        <class id> <x center> <y center> <box width> <box height>

    The four box values are divided by the image width / height read from the
    ``<size>`` element.

    Args:
        xmlpath: XML source accepted by ``ElementTree.parse``.
        savepath: Path of the label file to write; an existing file is overwritten.
        classes: Mapping of class name -> class id written at the start of a line.

    Raises:
        KeyError: If an object's name is missing from ``classes``.
    """
    rt = et.parse(xmlpath).getroot()
    size = rt.find("size")
    w = int(size.find("width").text)
    h = int(size.find("height").text)
    with open(savepath, "w") as f:
        for obj in rt.findall("object"):
            name = obj.find("name").text
            box = obj.find("bndbox")
            xmin = int(box.find("xmin").text)
            ymin = int(box.find("ymin").text)
            xmax = int(box.find("xmax").text)
            ymax = int(box.find("ymax").text)
            # Centre = mean of the two corners, scaled by the image size.
            f.write(
                f"{classes[name]} {(xmin + xmax)/w/2.} {(ymin + ymax)/h/2.} {(xmax-xmin)/w} {(ymax-ymin)/h}"
                + "\n"
            )

def trainval(xmlroot,imgroot,saveroot,errorId=[],classes={},tvp=1.0,trp=0.9):
    """Split the images into train/val/test and write them out in YOLO layout.

    Creates ``images/{train,val,test}`` and ``labels/{train,val,test}`` under
    ``saveroot``. Each image is copied into its split folder and its
    ``<stem>.xml`` from ``xmlroot`` is converted with ``xml2yolo`` into the
    matching ``labels`` folder.

    Args:
        xmlroot: Folder holding the XML annotation files.
        imgroot: Folder holding the images; every entry is treated as an image.
        saveroot: Root folder of the YOLO dataset to create.
        errorId: File stems (names without extension) to skip entirely.
        classes: Mapping of class name -> class id, passed on to ``xml2yolo``.
        tvp: Fraction of all images used for trainval; the rest goes to test.
        trp: Fraction of trainval used for training. The remaining trainval
            images go to val. With ``trp == 1.0`` every trainval image is a
            training image and 15% of them are additionally copied to val.

    Raises:
        AssertionError: If ``tvp`` or ``trp`` exceeds 1.0 or ``classes`` is empty.
    """
    assert tvp<=1.0 and trp <=1.0 and len(classes)>0
    # Create the output folders.
    mkdir(saveroot)
    for kind in ('images','labels'):
        mkdir(os.path.join(saveroot,kind))
        for split in ('train','val','test'):
            mkdir(os.path.join(saveroot,kind,split))

    # Split sizes.
    val_percent = 1 - trp if trp<1.0 else 0.15
    total_img = os.listdir(imgroot)
    num = len(total_img)
    tv = int(num * tvp)
    tr = int(tv * trp)
    va = int(tv * val_percent)

    # Sample from lists, then keep sets for O(1) membership tests in the loop.
    trainval_idx = random.sample(range(num), tv)  # trainset and valset
    train_idx = set(random.sample(trainval_idx, tr))  # trainset
    val_idx = set(random.sample(trainval_idx, va))  # only used when trp == 1.0
    trainval_set = set(trainval_idx)

    print(f"trainval_percent:{tvp},train_percent:{trp},val_percent:{val_percent}")
    for i in tqdm(range(num)):
        name = total_img[i]
        file_id = os.path.splitext(os.path.basename(name))[0]
        if file_id in errorId:
            continue
        op = os.path.join(imgroot,name)
        xmlp = os.path.join(xmlroot,file_id + '.xml')
        if i in trainval_set:
            splits = ['train'] if i in train_idx else ['val']
            # With trp == 1.0 nothing is left over for validation, so the
            # sampled training images are duplicated into val.
            if trp==1.0 and i in val_idx:
                splits.append('val')
        else:
            splits = ['test']
        for split in splits:
            xml2yolo(xmlp,os.path.join(saveroot,"labels",split,file_id + '.txt'),classes)
            sh.copy(op,os.path.join(saveroot,"images",split,name))

def maketxt(dir,saveroot,filename):
    """Write the path of every entry of ``dir`` to ``saveroot/filename``, one per line.

    Args:
        dir: Folder whose entries are listed.
        saveroot: Folder that receives the list file.
        filename: Name of the list file; an existing file is overwritten.
    """
    savetxt = os.path.join(saveroot,filename)
    with open(savetxt,'w') as f:
        # os.listdir returns entries in arbitrary order; sort them so the list
        # file is reproducible between runs.
        for i in tqdm(sorted(os.listdir(dir))):
            f.write(os.path.join(dir,i) + '\n')
                           
def toyolo(xmlroot,imgroot,saveroot,errorId=[],classes={},tvp=1,train_percent=0.9):
    """Build the YOLO dataset: split and convert the data, then write the image lists.

    Calls ``trainval`` to create the folders, copy the images and convert the
    annotations, then ``maketxt`` once per split to write ``train.txt``,
    ``val.txt`` and ``test.txt`` into ``saveroot``.
    """
    trainval(xmlroot,imgroot,saveroot,errorId,classes,tvp,train_percent)
    listings = (("train","train.txt"),("val","val.txt"),("test","test.txt"))
    for split, listing in listings:
        image_dir = os.path.join(saveroot,"images",split)
        maketxt(image_dir,saveroot,listing)
    print("Dataset to yolo format success.")

def test():
    """Run the YOLO conversion on the author's sample dataset (hard-coded paths)."""
    toyolo(
        "E:/MyProject/Dataset/hwtest/annotations",  # folder with the xml files
        "E:/MyProject/Dataset/hwtest/images",  # folder with the images
        "E:/MyProject/Dataset/yolo",  # output folder
        errorId=[],
        classes={"liner": 0,"bulk carrier": 1,"warship": 2,"sailboat": 3,"canoe": 4,"container ship": 5,"fishing boat": 6},
        train_percent=0.9,  # share of trainval images used for training
    )

# Run the sample conversion only when this file is executed as a script.
if __name__ == "__main__":
    test()

按照此脚本，将会在输出文件夹中生成以下内容：

代码语言：javascript复制

-yolo/
	|-images/
		|-train/
			|-1.jpg
			|-2.jpg
		|-test/
			|-3.jpg
			|-4.jpg
		|-val/
			|-5.jpg
			|-6.jpg
	|-labels/
		|-train/
			|-1.txt
			|-2.txt
		|-test/
			|-3.txt
			|-4.txt
		|-val/
			|-5.txt
			|-6.txt
	|-train.txt
	|-test.txt
	|-val.txt

图像识别神经网络深度学习人工智能 xml

0 人点赞