对于训练集图片分类记录在csv文件中的情况:
有两种处理方式:
一、将训练集图片分为x_train和y_train:
# Load the training labels and the sample submission (its 'id' column lists
# the test images).
labels = pd.read_csv('labels.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# One-hot encode the breed column and convert it to a NumPy array.
# The frame is densified by np.asarray on the next line, so a sparse
# intermediate is not requested.
targets_series = pd.Series(labels['breed'])
one_hot = pd.get_dummies(targets_series)
one_hot_labels = np.asarray(one_hot)

# Side length (pixels) every image is resized to.
img_size = 299


def _load_image(path, size):
    """Read the image at *path* with OpenCV and resize it to (size, size).

    Raises:
        FileNotFoundError: if OpenCV could not read the file. cv2.imread
            reports failure by returning None, which would otherwise only
            show up as an opaque error inside cv2.resize.
    """
    img = cv2.imread(path)
    if img is None:
        raise FileNotFoundError('cannot read image: {}'.format(path))
    # NOTE(review): OpenCV loads channels in BGR order -- confirm this matches
    # what the model's preprocessing expects.
    return cv2.resize(img, (size, size))


# Training images, in the same row order as the one-hot labels.
x_train = [
    _load_image('train/{}.jpg'.format(image_id), img_size)
    for image_id in tqdm(labels['id'].values)
]
y_train = list(one_hot_labels)

# Test images, in the order given by the sample submission.
x_test = [
    _load_image('test/{}.jpg'.format(image_id), img_size)
    for image_id in tqdm(sample_submission['id'].values)
]

# Convert to the arrays the model consumes: uint8 labels and float32 pixel
# values scaled into [0, 1].
y_train_raw = np.array(y_train, np.uint8)
x_train_raw = np.array(x_train, np.float32) / 255.
x_test = np.array(x_test, np.float32) / 255.
此方式可以直接用fit方法训练
# Train directly on the in-memory arrays. validation_split=0.1 asks fit() to
# hold out 10% of the supplied samples for validation (assumes `model` is a
# compiled Keras model -- it is defined outside this snippet).
history = model.fit(
    x_train_raw,
    y_train_raw,
    batch_size=16,
    epochs=20,
    shuffle=True,
    validation_split=0.1,
)
二、将训练集图像分成训练集和开发集,按类别保存到不同的文件夹
# Read the label file.
labels_csv = pd.read_csv("labels.csv")

# Breed name and image id of every row (row-aligned with each other).
breeds = pd.Series(labels_csv['breed'])
filenames = pd.Series(labels_csv['id'])

# Map each row's breed to its class index and one-hot encode it.
# np.unique returns the sorted distinct breeds; with return_inverse=True it
# also returns, for every row, the index of that row's breed in the sorted
# array, so no per-row search through unique_breeds is needed.
unique_breeds, breed_indices = np.unique(breeds, return_inverse=True)
n_breeds = len(unique_breeds)
# Row k of the identity matrix is the one-hot vector for class k.
labels = np.eye(n_breeds)[breed_indices]
np.where的用法
# Copy every image into the folder of its class, split into train/validate.
import os
import shutil

# Rows with an index below this go to the training folder, the rest to the
# validation folder. The split follows row order; it is not shuffled.
n_train = 8000

filenames_train = []
filenames_validate = []

# Copy (not move) each image to ./sorted/<split>/<breed>/<id>.jpg and record
# the destination path in the list for its split.
for i, image_id in enumerate(tqdm(filenames)):
    # Each row of `labels` is one-hot, so argmax recovers the class index
    # without comparing floats for equality.
    label = unique_breeds[np.argmax(labels[i])]
    filename = '{}.jpg'.format(image_id)
    if i < n_train:
        new_dir = './sorted/train/{}/'.format(label)
        filenames_train.append(new_dir + filename)
    else:
        new_dir = './sorted/validate/{}/'.format(label)
        filenames_validate.append(new_dir + filename)
    # exist_ok=True creates the folder on first use and is a no-op afterwards,
    # avoiding a separate check-then-create step.
    os.makedirs(new_dir, exist_ok=True)
    shutil.copy("train/{}.jpg".format(image_id), new_dir + filename)
shutil的用法
此方法可以直接用ImageDataGenerator产生训练集和开发集,用于训练和测试
# Directories holding one sub-folder per class.
train_dir = "sorted/train/"
test_dir = "sorted/validate/"

# Input geometry and batching parameters.
height, width, channels = 299, 299, 3
batch_size = 16
seed = 1337

# Options passed identically to both directory iterators.
flow_options = dict(
    target_size=(height, width),
    batch_size=batch_size,
    seed=seed,
    class_mode='categorical',
)

# Training generator: rescales pixel values by 1/255 and applies random
# rotation, shear, zoom and horizontal flips.
train_datagen = ImageDataGenerator(
    rotation_range=30,
    rescale=1. / 255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
)
train_generator = train_datagen.flow_from_directory(train_dir, **flow_options)

# Test generator: rescaling only, no augmentation.
test_datagen = ImageDataGenerator(rescale=1. / 255)
test_generator = test_datagen.flow_from_directory(test_dir, **flow_options)