只要给定每一类图像所在的文件夹的路径,即可完成全部的图片的加载:
例如:
但是问题是:如何划分训练集和测试集,使它们能够分别被加载?代码如下。
主要思想就是根据官方提供的数据集划分方式,使用shutil.copyfile()函数把训练集、测试集分别拷贝到不同的目录下!
import os
import numpy as np
import shutil
# Divide the dataset into train/test folders (without annotations).
img_dir = 'data/birds/'
save_dir = 'data/Birds/'
save_dir_train = os.path.join(save_dir, 'train')
save_dir_test = os.path.join(save_dir, 'test')


def _read_id_table(path):
    """Parse a whitespace-separated ``<image_id> <value>`` file into a dict.

    Blank lines are skipped; dict order follows file order.

    Raises:
        ValueError: if a non-blank line has no value column.
    """
    table = {}
    with open(path) as handle:  # closed on exit, unlike a bare open()
        for line_no, line in enumerate(handle, 1):
            if not line.strip():
                continue
            fields = line.split(None, 1)
            if len(fields) != 2:
                raise ValueError(
                    "%s:%d: expected '<image_id> <value>', got %r"
                    % (path, line_no, line)
                )
            # strip() rather than [:-1]: the last line may lack a newline,
            # in which case [:-1] would eat a real character.
            table[fields[0]] = fields[1].strip()
    return table


def split_dataset_by_label(img_dir, save_dir_train, save_dir_test):
    """Copy images into ``<train|test>/<class label>/`` folders.

    Reads ``images.txt``, ``train_test_split.txt`` and
    ``image_class_labels.txt`` from *img_dir* and copies each listed image
    from ``<img_dir>/images/`` into *save_dir_train* or *save_dir_test*,
    inside a sub-folder named after the value in ``image_class_labels.txt``.
    Records are matched by image id, not by line position.

    Args:
        img_dir: Folder holding the three index files and ``images/``.
        save_dir_train: Destination root for images flagged as training.
        save_dir_test: Destination root for the remaining images.

    Returns:
        A ``(train_count, test_count)`` tuple of copied files.

    Raises:
        KeyError: if an image id is missing from the split or label file.
        ValueError: if an index line is malformed.
        OSError: if a folder cannot be created or a file cannot be copied.
    """
    image_paths = _read_id_table(os.path.join(img_dir, "images.txt"))
    split_flags = _read_id_table(os.path.join(img_dir, "train_test_split.txt"))
    class_labels = _read_id_table(
        os.path.join(img_dir, "image_class_labels.txt"))

    os.makedirs(save_dir_train, exist_ok=True)
    os.makedirs(save_dir_test, exist_ok=True)

    train_count = 0
    test_count = 0
    for image_id, image_path in image_paths.items():
        is_train = int(split_flags[image_id])
        classes = class_labels[image_id]
        image_name = image_path.rsplit("/", 1)[-1]

        # exist_ok only tolerates "already there"; real failures
        # (permissions, missing drive, ...) still raise.
        class_dir = os.path.join(
            save_dir_train if is_train else save_dir_test, classes)
        os.makedirs(class_dir, exist_ok=True)

        src_path = os.path.join(img_dir, 'images', image_path)
        dst_path = os.path.join(class_dir, image_name)
        shutil.copyfile(src_path, dst_path)
        print("src:", src_path, "dst:", dst_path)

        if is_train:
            train_count += 1
        else:
            test_count += 1
    return train_count, test_count


if __name__ == "__main__":
    split_dataset_by_label(img_dir, save_dir_train, save_dir_test)
# NOTE(review): `dataset` is not defined anywhere in the visible code;
# presumably it is a torchvision ImageFolder built from one of the split
# folders -- confirm before running this line.
print(dataset.class_to_idx)
输出每一类对应的标签(即类别索引,而不是One-hot向量),发现是无序的(与原始类别编号的顺序不一致),很不方便,于是保留原来文件夹的命名方式:
修改后的代码如下:
import os
import numpy as np
import shutil
# Divide the dataset into train/test folders (without annotations),
# keeping the original class folder names.
img_dir = 'data/birds/'
save_dir = 'data/Birds_new/'
save_dir_train = os.path.join(save_dir, 'train')
save_dir_test = os.path.join(save_dir, 'test')


def _read_id_table(path):
    """Parse a whitespace-separated ``<image_id> <value>`` file into a dict.

    Blank lines are skipped; dict order follows file order.

    Raises:
        ValueError: if a non-blank line has no value column.
    """
    table = {}
    with open(path) as handle:  # closed on exit, unlike a bare open()
        for line_no, line in enumerate(handle, 1):
            if not line.strip():
                continue
            fields = line.split(None, 1)
            if len(fields) != 2:
                raise ValueError(
                    "%s:%d: expected '<image_id> <value>', got %r"
                    % (path, line_no, line)
                )
            # strip() rather than [:-1]: the last line may lack a newline,
            # in which case [:-1] would eat a real character.
            table[fields[0]] = fields[1].strip()
    return table


def split_dataset_by_folder(img_dir, save_dir_train, save_dir_test):
    """Copy images into ``<train|test>/<original class folder>/``.

    Reads ``images.txt`` and ``train_test_split.txt`` from *img_dir* and
    copies each listed image from ``<img_dir>/images/`` into
    *save_dir_train* or *save_dir_test*, inside a sub-folder named after the
    first component of the image's relative path. Records are matched by
    image id, not by line position. ``image_class_labels.txt`` is not read
    because the folder name is taken from the image path.

    Args:
        img_dir: Folder holding the index files and ``images/``.
        save_dir_train: Destination root for images flagged as training.
        save_dir_test: Destination root for the remaining images.

    Returns:
        A ``(train_count, test_count)`` tuple of copied files.

    Raises:
        KeyError: if an image id is missing from the split file.
        ValueError: if an index line is malformed or an image path has no
            ``<folder>/<file>`` structure.
        OSError: if a folder cannot be created or a file cannot be copied.
    """
    image_paths = _read_id_table(os.path.join(img_dir, "images.txt"))
    split_flags = _read_id_table(os.path.join(img_dir, "train_test_split.txt"))

    os.makedirs(save_dir_train, exist_ok=True)
    os.makedirs(save_dir_test, exist_ok=True)

    train_count = 0
    test_count = 0
    for image_id, image_path in image_paths.items():
        is_train = int(split_flags[image_id])

        parts = image_path.split("/")
        if len(parts) < 2:
            raise ValueError(
                "image %s: path %r has no class folder" % (image_id, image_path)
            )
        class_folder = parts[0]
        image_name = parts[-1]

        # exist_ok only tolerates "already there"; real failures
        # (permissions, missing drive, ...) still raise.
        class_dir = os.path.join(
            save_dir_train if is_train else save_dir_test, class_folder)
        os.makedirs(class_dir, exist_ok=True)

        src_path = os.path.join(img_dir, 'images', image_path)
        dst_path = os.path.join(class_dir, image_name)
        shutil.copyfile(src_path, dst_path)
        print("src:", src_path, "dst:", dst_path)

        if is_train:
            train_count += 1
        else:
            test_count += 1
    return train_count, test_count


if __name__ == "__main__":
    split_dataset_by_folder(img_dir, save_dir_train, save_dir_test)