因为我刚开始是想利用预训练的模型(ViT-B/32 等)来得到图像和文本的编码,然后用一些机器学习的模型训练分类器。但是我训练得到的最高正确率(每个类只使用 4 张图片训练)也就在 62% 左右,所以准备换思路了,这个代码也就没啥用了。
因为比赛有要求,所以每个类只选了 4 张图片作为训练集,具体的选择使用了 KMeans 聚类。所以如果想要直接应用的话,使用全部的训练集,效果应该会提升不少。
代码解释是 AI 生成的,或者你也可以自己找 AI 问。
反正这个代码整体就是做了特征提取和测试的工作,然后可以直接再写几行代码,用 sklearn 库来训练模型。
这里得装 CUDA、显卡驱动(网上都有教程)以及 PyTorch 库。可能遇到的问题是某个 .so 文件版本不匹配,直接把那个版本不匹配的 .so 删了就可以。我找到的原因是:用 pip 安装 PyTorch 的时候会附带一些 CUDA 的库,而这些库在装 CUDA 的时候已经装过了,PyTorch 里想调用的版本和系统 CUDA 用的版本不一样;只要把不匹配的那个删了,就会自动调用系统的库。
另外,有人想搞这个比赛的可以联系我邮箱[email protected]或者私信,队伍还有一个名额
import torch
from torchvision import transforms
from PIL import Image
import clip
import os
from tqdm import tqdm
import argparse
from sklearn.svm import SVC
from sklearn.cluster import KMeans
import numpy as np
from sklearn.linear_model import LogisticRegression
from datetime import datetime
class PreModel():
    """Base class for the pretrained feature extractors in this file.

    It only provides a ``name`` attribute; subclasses may overwrite it.
    """

    def __init__(self) -> None:
        # Placeholder name used until a concrete subclass sets its own.
        self.name = "none"
class VIT_B(PreModel):
    """Feature extractor backed by the CLIP checkpoint ``./ViT-B-32.pt``.

    The checkpoint is loaded with ``clip.load``, which returns the model and
    the image preprocessing callable stored in ``self.model`` and
    ``self.preprocess``.
    """

    def __init__(self) -> None:
        # Actually run the base initialiser (a bare ``super()`` does nothing).
        super().__init__()
        self.name = "VIT-B"
        # Use the first GPU when one is visible, otherwise fall back to the
        # CPU instead of failing inside clip.load.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device_cpu = "cpu"
        self.model, self.preprocess = clip.load("./ViT-B-32.pt", device=self.device)
        print("初始化模型VIT-B")

    def extract_text_feature(self, text):
        """Encode ``text`` with the model's text encoder.

        Args:
            text: object with a ``.to(device)`` method that
                ``model.encode_text`` accepts (presumably the output of
                ``clip.tokenize`` -- confirm against the caller).

        Returns:
            The encoded text as a NumPy array (not normalised).
        """
        with torch.no_grad():
            text = text.to(self.device)
            return self.model.encode_text(text).cpu().numpy()

    def extract_image_feature(self, image_path):
        """Encode one image file into an L2-normalised feature vector.

        Args:
            image_path: path of the image file to open with PIL.

        Returns:
            A NumPy array holding the feature of that single image (the
            batch dimension added for the model is removed again).
        """
        with torch.no_grad():
            # Close the file handle as soon as preprocessing is done.
            with Image.open(image_path) as raw_image:
                batch = self.preprocess(raw_image).unsqueeze(0).to(self.device)
            image_feature = self.model.encode_image(batch)
            # Scale to unit length along the feature dimension.
            image_feature = image_feature / image_feature.norm(dim=-1, keepdim=True)
            return image_feature.cpu().numpy()[0]
class RN101(PreModel):
    """Feature extractor backed by the CLIP checkpoint ``./mine/RN101/RN101.pt``.

    Mirrors the interface of the ViT-B extractor in this file:
    ``extract_text_feature`` and ``extract_image_feature``.
    """

    def __init__(self):
        super().__init__()
        self.name = "RN101"
        # The extract_* methods move their inputs to ``self.device``, so it
        # must exist and match the device the model is loaded on.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device_cpu = "cpu"
        self.model, self.preprocess = clip.load('./mine/RN101/RN101.pt', device=self.device)
        print("初始化模型RN-101")

    def extract_text_feature(self, text):
        """Encode ``text`` with the model's text encoder.

        Args:
            text: object with a ``.to(device)`` method that
                ``model.encode_text`` accepts (presumably the output of
                ``clip.tokenize`` -- confirm against the caller).

        Returns:
            The encoded text as a NumPy array (not normalised).
        """
        with torch.no_grad():
            text = text.to(self.device)
            return self.model.encode_text(text).cpu().numpy()

    def extract_image_feature(self, image_path):
        """Encode one image file into an L2-normalised feature vector.

        Args:
            image_path: path of the image file to open with PIL.

        Returns:
            A NumPy array holding the feature of that single image (the
            batch dimension added for the model is removed again).
        """
        with torch.no_grad():
            # Close the file handle as soon as preprocessing is done.
            with Image.open(image_path) as raw_image:
                batch = self.preprocess(raw_image).unsqueeze(0).to(self.device)
            image_feature = self.model.encode_image(batch)
            # Divide by the norm (the feature must not be replaced by it).
            image_feature = image_feature / image_feature.norm(dim=-1, keepdim=True)
            # Return the feature, not the preprocessed input image.
            return image_feature.cpu().numpy()[0]
class Data():
    """Dataset bookkeeping, feature caching and few-shot sample selection.

    The class reads the training index and the class-name table, caches
    image features produced by a feature-extractor object, selects a few
    representative training samples per class, and evaluates a fitted
    classifier.
    """

    def __init__(self, train_data_index_path: str, train_data_path: str, class_labels: str, test_data_path) -> None:
        """Read the training index and the class-name table.

        Args:
            train_data_index_path: text file with one
                ``<image name> <integer label>`` record per line.
            train_data_path: directory the image names are relative to.
            class_labels: text file with one ``<class name> <integer id>``
                record per line.
            test_data_path: directory whose entries are the test images.
        """
        # Whitespace splitting (instead of chopping the last character)
        # keeps the final label intact when the file has no trailing
        # newline; blank lines are skipped.
        with open(train_data_index_path, 'r') as f:
            records = [line.split() for line in f if line.strip()]
        self.train_data_path = train_data_path
        self.train_img_names = np.array([rec[0] for rec in records])  # image names
        self.train_img_labels = np.array([int(rec[1]) for rec in records], dtype=int)  # labels
        self.test_data_path = test_data_path
        # NOTE(review): os.listdir gives no ordering guarantee; cached test
        # features are only aligned with these names while the listing
        # order stays the same between runs.
        self.test_img_names = os.listdir(test_data_path)
        self.train_nums = len(self.train_img_labels)  # number of training samples
        self.class_nums = len(set(self.train_img_labels.tolist()))

        # Class table: natural-language class name <-> integer id.
        with open(class_labels, "r") as f:
            pairs = [line.split() for line in f if line.strip()]
        self.class_labels = {pair[0]: int(pair[1]) for pair in pairs}
        self.labels_class = {int(pair[1]): pair[0] for pair in pairs}

    @staticmethod
    def _load_or_extract(cache_file, cache_dir, image_dir, image_names, model):
        """Load cached features, or extract and cache them after asking.

        Returns:
            ``(features, loaded_from_cache)``. ``features`` is an empty
            list when the cache is missing and the user declines extraction.
        """
        try:
            return torch.load(cache_file).cpu().numpy(), True
        except FileNotFoundError:
            pass
        ans = input("未找到文件,是否重新提取保存?(Y/N)")
        if ans.strip().upper() != 'Y':
            return [], False
        with torch.no_grad():
            features = np.array([
                model.extract_image_feature(image_dir + '/' + name)
                for name in tqdm(image_names)
            ])
        # Make sure the cache directory exists before saving.
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        torch.save(torch.Tensor(features), cache_file)
        return features, False

    def extract_train_feature(self, path, model):
        """Fill ``self.train_features`` from ``path + '/train_features.pt'``.

        If the cache file is missing the user is asked whether to extract
        the features with ``model.extract_image_feature`` and save them.
        """
        print("提取训练集数据特征")
        self.train_features_path = path + '/train_features.pt'
        self.train_features, cached = self._load_or_extract(
            self.train_features_path, path, self.train_data_path,
            self.train_img_names, model)
        if cached:
            print("成功直接加载保存的训练数据特征" + str(self.train_features.shape))

    def extract_test_feature(self, path, model):
        """Fill ``self.test_features`` from ``path + '/test_features'``.

        Same cache-or-extract behaviour as ``extract_train_feature``.
        """
        print("提取测试集特征")
        self.test_features_path = path + '/test_features'
        self.test_features, cached = self._load_or_extract(
            self.test_features_path, path, self.test_data_path,
            self.test_img_names, model)
        if cached:
            print("成功直接加载保存的测试数据特征")

    @staticmethod
    def _pick_representatives(class_features, rng, nearest_num=2, random_num=2):
        """Return positions (within ``class_features``) of the chosen samples.

        A single K-Means++ cluster is fitted to obtain the class centre.
        The ``nearest_num`` samples closest to the centre are taken first,
        then ``random_num`` distinct samples are drawn from the closer half
        of the samples whose distance is at most the mean distance. When
        that pool is too small the remaining samples of the class are used,
        so small classes yield fewer picks instead of raising.
        """
        # random_state=0 keeps the clustering reproducible.
        kmeans = KMeans(n_clusters=1, init='k-means++', random_state=0)
        kmeans.fit(class_features)
        center = kmeans.cluster_centers_[0]
        as_float = np.asarray(class_features, dtype=np.float64)
        distances = np.sqrt(np.sum((as_float - center) ** 2, axis=1))
        order = np.argsort(distances)
        nearest = order[:nearest_num]
        # Samples no farther from the centre than average, closest first.
        close = order[distances[order] <= distances.mean()]
        # Skip the ones already taken and keep the closer half.
        pool = close[nearest_num:len(close) // 2]
        if len(pool) < random_num:
            pool = order[nearest_num:]
        picked = [int(i) for i in nearest]
        if len(pool):
            extra = rng.choice(pool, size=min(random_num, len(pool)), replace=False)
            picked.extend(int(i) for i in extra)
        return picked

    def select_train_feature(self, feature_path, label_path):
        """Load or compute the per-class representative training samples.

        Results are stored in ``selected_train_feature`` (2-D array, the
        consecutive rows of one class are adjacent -- four per class when
        the class is large enough), ``selected_train_labels`` and
        ``selected_train_names``. They are cached in
        ``feature_path + '/selected_train_features.pt'`` and
        ``label_path + '/selected_train_labels.txt'``.

        Raises:
            ValueError: if no cache is usable and ``train_features`` does
                not hold one row per training image.
        """
        print("选择训练数据特征")
        feature_file = feature_path + '/selected_train_features.pt'
        label_file = label_path + '/selected_train_labels.txt'
        try:
            features = torch.load(feature_file).cpu().numpy()
            with open(label_file, 'r') as f:
                rows = [line.split() for line in f if line.strip()]
            labels = np.array([int(row[0]) for row in rows])
            # split() drops the trailing newline, so the names compare
            # equal to the entries of train_img_names.
            names = np.array([row[1] for row in rows])
        except FileNotFoundError:
            pass  # no cache yet: select below
        except Exception as err:
            # The cache is best-effort: report why it was unusable and
            # recompute. KeyboardInterrupt/SystemExit are not swallowed.
            print("已保存的选择结果无法读取,将重新选择 : " + repr(err))
        else:
            self.selected_train_feature = features
            self.selected_train_labels = labels
            self.selected_train_names = names
            print("成功直接加载选择的训练集特征")
            return

        print('总的图片类别数 : ' + str(self.class_nums))
        train_features = np.asarray(self.train_features)
        if train_features.ndim < 2 or train_features.shape[0] != self.train_img_names.shape[0]:
            raise ValueError("train features are missing; run extract_train_feature first")
        # Local seeded generator so the random picks are repeatable too.
        rng = np.random.RandomState(0)
        chosen_features = []
        chosen_labels = []
        chosen_names = []
        # Group by label value; this does not rely on the index file being
        # sorted by label or on labels being 0..class_nums-1.
        for label in tqdm(np.unique(self.train_img_labels)):
            positions = np.flatnonzero(self.train_img_labels == label)
            class_features = train_features[positions]
            for local in self._pick_representatives(class_features, rng):
                chosen_features.append(class_features[local])
                chosen_labels.append(label)
                chosen_names.append(self.train_img_names[positions[local]])
        self.selected_train_names = np.array(chosen_names)
        self.selected_train_feature = np.array(chosen_features)
        self.selected_train_labels = np.array(chosen_labels)
        print("选取训练数据的形状 : " + str(self.selected_train_feature.shape))
        for directory in (feature_path, label_path):
            if directory:
                os.makedirs(directory, exist_ok=True)
        torch.save(torch.Tensor(self.selected_train_feature), feature_file)
        with open(label_file, 'w') as f:
            for label, name in zip(self.selected_train_labels, self.selected_train_names):
                f.write(str(label) + ' ' + str(name) + '\n')
        print("保存选取训练数据")

    def test_on_train(self, model, data_num=10000):
        """Evaluate ``model`` on training images that were not selected.

        Args:
            model: object with a ``predict(features)`` method.
            data_num: number of held-out samples to draw without
                replacement; clamped to the number available.

        Returns:
            ``(accuracy, error_set)`` where ``error_set`` lists the names
            of the misclassified images. ``(0.0, [])`` when nothing is
            available to test on.
        """
        # Held-out = every training image whose name was not selected.
        held_out = np.flatnonzero(~np.isin(self.train_img_names, self.selected_train_names))
        data_num = min(data_num, held_out.shape[0])
        error_set = []
        right_num = 0
        if data_num > 0:
            test_indices = np.random.choice(held_out, size=data_num, replace=False)
            tmp_test_features = np.asarray(self.train_features)[test_indices]
            tmp_test_labels = self.train_img_labels[test_indices]
            correct = np.asarray(model.predict(tmp_test_features)) == tmp_test_labels
            right_num = int(np.count_nonzero(correct))
            error_set = [self.train_img_names[i] for i in test_indices[~correct]]
        accuracy = right_num / data_num if data_num else 0.0
        print("测试数据总数 : " + str(data_num))
        print("预测正确总数 : " + str(right_num))
        print("正确率 : " + str(accuracy * 100) + '%')
        return accuracy, error_set

    def test_submit(self, model):
        """Write the top-5 predicted labels of every test image to a file.

        The file is named ``result<timestamp>.txt`` in the working
        directory; each line is ``<image name> <l1> <l2> <l3> <l4> <l5>``
        with the most probable label first.

        Args:
            model: object with ``predict_proba``; when it exposes
                ``classes_`` the probability columns are mapped through it
                to obtain the real label values.
        """
        test_re = model.predict_proba(self.test_features)
        classes = getattr(model, 'classes_', None)
        time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        # The context manager guarantees the results are flushed to disk.
        with open('result' + time_str + '.txt', 'w') as f:
            for name, probs in zip(self.test_img_names, test_re):
                top = np.argsort(probs)[-5:][::-1]  # best first
                labels = top if classes is None else np.asarray(classes)[top]
                f.write(name + ' ' + ' '.join([str(i) for i in labels]) + '\n')
# Driver script: build the feature extractors and the dataset helper, then
# cache the training features and select the representative training samples.
# The first three paths are passed to Data as the training index file, the
# training image root and the class-name table, in that order.
tdip='./Dataset/train.txt'
tdp='./Dataset'
cl='./Dataset/classes.txt'
# Feature extractor that uses the ViT-B model
vit=VIT_B()
# NOTE(review): `rn` is not referenced by any statement visible after this line.
rn=RN101()
# Initialise the data class
# Directory used for the cached feature and label files
data_cata='./mine/vib-t/data'
# Directory holding the test images
tsdp='./Dataset/TestSetA'
data=Data(tdip,tdp,cl,tsdp)
data.extract_train_feature(path=data_cata,model=vit)
data.select_train_feature(feature_path=data_cata,label_path=data_cata)
`torch`:用于深度学习计算;
`torchvision`:处理图像;
`PIL`:用于图像处理;
`clip`:处理文本与图像的嵌入模型;
`numpy`:科学计算;
`argparse`:命令行参数解析;
`tqdm`:进度条;
`os`:操作系统接口;
`datetime`:日期时间操作。
`PreModel` 类:基类,提供基本结构,含属性 `name`;
`VIT_B` 类:继承 `PreModel`,实现特定模型,加载 ViT-B 模型,用于文本和图像特征提取;
`RN101` 类:同上,加载 RN101 模型,用于文本和图像特征提取。
`Data` 类负责数据准备:读取训练、测试数据,包括图像名字、标签;读取类标签信息,提取类标签对应关系;用 K-Means++ 求聚类中心;选择代表性训练数据;存储处理结果和标签。
`extract_text_feature`:在模型中提取文本特征;
`extract_image_feature`:在模型中提取图像特征。
`extract_train_feature`:读取训练集特征,已保存则直接加载,否则重新提取;
`extract_test_feature`:同理,处理测试集。
`select_train_feature`:读取已选出的训练数据,不存在则按 K-Means++ 选出代表性数据。
`test_on_train`:在训练集上测试模型性能,参数为模型和测试数据数量,返回准确率及错误样本;(这里 AI 说错了,因为我选的是没有训练过的数据:给的训练集总共 18 万多张图片,我选的训练集就 1500 张左右)
`test_submit`:用传入的模型预测测试集的类别概率,并输出结果文件。