coco数据集中筛选出特定类别，并记录jpg文件名称

时间：2023-06-02

代码1

coco数据集过大，希望从里面挑选特定的类别，来构成测试集。

以下cocodeal.py文件能够保留coco中指定的类别数据，同时生成.xml文件。具体应用可以见代码注释。

# cocodeal.py
# Filter a COCO dataset down to a chosen set of categories: every image that
# contains at least one of them is copied, and a Pascal-VOC style .xml holding
# only the boxes of the chosen categories is written for it.
#
# Expected layout (this file sits next to the ``coco`` folder):
# ├── coco
# │   ├── annotations        # instances_train2014.json, instances_val2014.json
# │   └── images
# │       ├── train2014      # *.jpg
# │       └── val2014        # *.jpg
# └── cocodeal.py            # this file
# (The original layout also listed coco/labels/*.txt; this script never reads it.)
#
# Output: ``annotations_filtered/`` (*.xml) and ``images_filtered/<set>/`` (*.jpg).
# Both folders are deleted and recreated on every run.
from pycocotools.coco import COCO
import os
import shutil
from tqdm import tqdm
import skimage.io as io
import matplotlib.pyplot as plt
import cv2
from PIL import Image, ImageDraw, ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True


def mkr(path):
    """Create ``path`` as an empty directory, wiping it first if it exists."""
    if os.path.exists(path):
        shutil.rmtree(path)
    # makedirs (not mkdir) so that a nested ``savepath`` also works.
    os.makedirs(path)


########################### adapt the parameters below to your paths
# Root folder for the results ("" means the current directory).
savepath = ""
# Folder receiving the copied images (one sub-folder per dataset split).
img_dir = savepath + 'images_filtered/'
mkr(img_dir)
# Folder receiving the generated .xml annotations.
anno_dir = savepath + 'annotations_filtered/'
mkr(anno_dir)
# Must match the folder names under coco/images, e.g. ['train2014'] or ['val2017'].
datasets_list = ['train2014', 'val2014']
# Names of the categories to keep.
classes_names = ["elephant"]
# Location of the COCO dataset; it holds ``annotations`` and ``images``.
dataDir = 'coco'
###############################

# NOTE(review): the published listing had lost every XML tag of the three
# templates below (which made ``objstr % (...)`` raise TypeError). They are
# reconstructed here following the Pascal VOC annotation layout; confirm the
# exact tag set against whatever consumes the .xml files.
headstr = """\
<annotation>
    <folder>VOC</folder>
    <filename>%s</filename>
    <source>
        <database>My Database</database>
        <annotation>COCO</annotation>
        <image>flickr</image>
        <flickrid>NULL</flickrid>
    </source>
    <size>
        <width>%d</width>
        <height>%d</height>
        <depth>%d</depth>
    </size>
    <segmented>0</segmented>
"""
objstr = """\
    <object>
        <name>%s</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>%d</xmin>
            <ymin>%d</ymin>
            <xmax>%d</xmax>
            <ymax>%d</ymax>
        </bndbox>
    </object>
"""
tailstr = '''\
</annotation>
'''


def id2name(coco):
    """Return a dict mapping each category id of ``coco`` to its name."""
    return {cat['id']: cat['name'] for cat in coco.dataset['categories']}


def write_xml(anno_path, head, objs, tail):
    """Write one annotation file.

    ``head`` and ``tail`` are written verbatim; each entry of ``objs`` is a
    ``[class_name, xmin, ymin, xmax, ymax]`` list rendered through ``objstr``.
    """
    with open(anno_path, "w", encoding="utf-8") as f:
        f.write(head)
        for obj in objs:
            f.write(objstr % (obj[0], obj[1], obj[2], obj[3], obj[4]))
        f.write(tail)


def save_annotations_and_imgs(coco, dataset, filename, objs):
    """Copy ``filename`` into ``img_dir/<dataset>/`` and write its .xml.

    The annotation gets the image's base name, e.g.
    COCO_train2014_000000196610.jpg --> COCO_train2014_000000196610.xml.
    Images that cannot be decoded, or that are single-channel, are skipped
    with a message and produce neither a copy nor an annotation.
    """
    anno_path = anno_dir + os.path.splitext(filename)[0] + '.xml'
    img_path = dataDir + '/images/' + dataset + '/' + filename
    dst_imgpath = img_dir + dataset + "/" + filename

    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals a missing/corrupt file by returning None.
        print(filename + " could not be read, skipped")
        return
    if img.ndim < 3 or img.shape[2] == 1:
        print(filename + " not a RGB image")
        return

    shutil.copy(img_path, dst_imgpath)
    # img.shape is (height, width, channels); the header wants width first.
    head = headstr % (filename, img.shape[1], img.shape[0], img.shape[2])
    write_xml(anno_path, head, objs, tailstr)


def showimg(coco, dataset, img, classes, cls_id, show=True):
    """Collect the boxes of the wanted categories for one image.

    ``img`` is an image record from ``coco.loadImgs``, ``classes`` the
    id -> name dict from ``id2name`` and ``cls_id`` the category ids used to
    query annotations. Returns a list of
    ``[class_name, xmin, ymin, xmax, ymax]`` with integer pixel coordinates.
    When ``show`` is true the boxes are also drawn on the image and displayed.
    """
    if show:
        canvas = Image.open('%s/%s/%s/%s' % (dataDir, 'images', dataset, img['file_name']))
        draw = ImageDraw.Draw(canvas)

    # Fetch the annotations of this image restricted to the requested categories.
    annIds = coco.getAnnIds(imgIds=img['id'], catIds=cls_id, iscrowd=None)
    anns = coco.loadAnns(annIds)

    objs = []
    for ann in anns:
        class_name = classes[ann['category_id']]
        if class_name not in classes_names or 'bbox' not in ann:
            continue
        # bbox is [x, y, width, height]; convert to corner coordinates.
        bbox = ann['bbox']
        xmin = int(bbox[0])
        ymin = int(bbox[1])
        xmax = int(bbox[2] + bbox[0])
        ymax = int(bbox[3] + bbox[1])
        objs.append([class_name, xmin, ymin, xmax, ymax])
        if show:
            draw.rectangle([xmin, ymin, xmax, ymax])

    if show:
        plt.figure()
        plt.axis('off')
        plt.imshow(canvas)
        plt.show()
    return objs


for dataset in datasets_list:
    mkr(img_dir + dataset)
    # Annotation file of this split, e.g. coco/annotations/instances_train2014.json
    annFile = '{}/annotations/instances_{}.json'.format(dataDir, dataset)
    # Creating the COCO object parses the JSON and builds its index; it prints
    # "loading annotations into memory... / creating index... / index created!".
    coco = COCO(annFile)

    # Show every category of the dataset, then the ids of the ones we keep.
    classes = id2name(coco)
    print(classes)
    classes_ids = coco.getCatIds(catNms=classes_names)
    print(classes_ids)

    # An image holding several wanted categories is returned once per
    # category; remember what was handled so it is only processed once.
    seen = set()
    for cls in classes_names:
        cls_id = coco.getCatIds(catNms=[cls])
        img_ids = coco.getImgIds(catIds=cls_id)
        print(cls, len(img_ids))
        for imgId in tqdm(img_ids):
            if imgId in seen:
                continue
            seen.add(imgId)
            img = coco.loadImgs(imgId)[0]
            filename = img['file_name']
            objs = showimg(coco, dataset, img, classes, classes_ids, show=False)
            save_annotations_and_imgs(coco, dataset, filename, objs)

代码2

通过上面代码能够筛选出想要的种类的数据集。如果觉得数据集过大，或者想要生成这些图片文件的名称（不含.jpg），可以通过以下代码来实现。

# getImageID.py
# Pick the first ``selct_n`` train images and the first ``selct_n`` val images
# (in file-name order) from images_filtered/, copy them together with their
# .xml files from annotations_filtered/, and store the image names (without
# extension) in a text file per split.
#
# Lives in the same directory as cocodeal.py. Run cocodeal.py first, optionally
# prune images_filtered/ by hand, then run this file. Rename the output
# folders afterwards if needed.
# .
# ├── annotations_filtered    required input
# ├── coco
# ├── cocodeal.py
# ├── getImageID.py           this file
# ├── images_filtered         required input
# ├── myannotations           generated folder
# ├── myimages                generated folder
# ├── train2014imgid.txt      generated file
# └── val2014imgid.txt        generated file
import os
import shutil

# Number of images to pick per split.
selct_n = 200


def mkr(path):
    """Create ``path`` as an empty directory, wiping it first if it exists."""
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)


def copy_subset(img_src_dir, anno_src_dir, img_dst_dir, anno_dst_dir, list_path, limit):
    """Copy up to ``limit`` images and their annotations; return the count.

    Images are taken from ``img_src_dir`` in sorted order so that "the first
    ``limit`` images" is reproducible (os.walk itself yields names in
    arbitrary order). For every image the matching ``<stem>.xml`` is copied
    from ``anno_src_dir`` to ``anno_dst_dir`` and ``<stem>`` is written as one
    line of ``list_path``, which is overwritten. A missing annotation raises
    the error from ``shutil.copy``.
    """
    copied = 0
    with open(list_path, "w") as listing:
        for dirpath, dirnames, filenames in os.walk(img_src_dir):
            dirnames.sort()  # in-place sort fixes the order os.walk descends in
            print(dirnames)
            print(dirpath)
            for filename in sorted(filenames):
                if copied >= limit:
                    # Stop the whole walk, not just the current directory.
                    return copied
                stem = os.path.splitext(filename)[0]
                shutil.copy(src=os.path.join(dirpath, filename),
                            dst=os.path.join(img_dst_dir, filename))
                shutil.copy(src=os.path.join(anno_src_dir, stem + ".xml"),
                            dst=os.path.join(anno_dst_dir, stem + ".xml"))
                # Record the name only once both copies succeeded.
                print(stem, file=listing)
                copied += 1
    return copied


# Images are picked from the train2014 / val2014 folders below this path.
img_src = "images_filtered/"
anno_src = "annotations_filtered/"
img_path = "myimages/"
anno_path = "myannotations/"
mkr(img_path)   # selected images end up here
mkr(anno_path)  # selected .xml files end up here

# Train images go to myimages/train2014/, val images to myimages/val2014/.
sets = ['train2014', 'val2014']
for x in sets:
    mkr(img_path + x)
    copy_subset(img_src + x, anno_src, img_path + x, anno_path,
                x + "imgid.txt", selct_n)

上一篇：试题基础练习序列求和

下一篇：python中random.random用法