基于Openai预训练模型VIT-B的图像分类

基于Openai预训练模型VIT-B的图像分类

前言

我一开始是想利用预训练模型(ViT-B/32等)得到图像和文本的编码,然后用一些机器学习模型来训练分类器。但是这样训练得到的最高正确率(每个类只使用4张图片训练)也就在62%左右,所以准备换思路了,这份代码对我来说也就没什么用了。

因为比赛有要求,所以每个类只选了4张图片作为训练集,具体的选取使用了KMeans聚类。如果想直接应用这份代码,使用全部训练集的话效果应该会提升不少。

文末的代码解释是AI生成的,有不清楚的地方也可以自己找AI问。

总之,这份代码整体上只做了特征提取和测试的工作,之后再写几行代码、用sklearn库就可以直接训练模型。

运行前需要安装显卡驱动、CUDA以及PyTorch库,安装方法网上都有。可能遇到的问题是某个.so文件版本不匹配,直接把那个版本不匹配的.so文件删掉即可。我找到的原因是:用pip安装PyTorch时会附带一些CUDA的库,而这些库在安装CUDA时已经装到系统里了,PyTorch想调用的版本和CUDA实际使用的版本不一致;只要删掉那个不匹配的.so,就会自动调用系统里的库。

另外,有人想搞这个比赛的可以联系我邮箱[email protected]或者私信,队伍还有一个名额

注意里面的路径需要改一下,改成你自己的

训练数据下载链接

  • Trainset
  • classname.txt
  • train.txt

代码

import torch
from torchvision import transforms
from PIL import Image
import clip
import os
from tqdm import tqdm
import argparse
from sklearn.svm import SVC 
from sklearn.cluster import KMeans
import numpy as np
from sklearn.linear_model import LogisticRegression
from datetime import datetime
class PreModel:
    """Common base class for the pretrained feature extractors in this file.

    It only carries a ``name`` attribute, which holds the placeholder
    ``"none"`` until a subclass assigns its own value.
    """

    def __init__(self) -> None:
        # Placeholder model name; subclasses are expected to overwrite it.
        self.name = "none"
class VIT_B(PreModel):
    """Feature extractor backed by a CLIP checkpoint loaded with ``clip.load``.

    Args:
        model_path: Checkpoint path handed to ``clip.load``. Defaults to the
            path the original script used.
        device: Torch device string. When omitted, ``"cuda:0"`` is used if
            CUDA is available and ``"cpu"`` otherwise, so the class no longer
            crashes on machines without a GPU.
    """

    def __init__(self, model_path: str = "./ViT-B-32.pt", device=None) -> None:
        # The original wrote a bare `super()`, which builds a proxy object but
        # never runs the parent initializer.
        super().__init__()
        self.name = "VIT-B"
        if device is None:
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.device_cpu = "cpu"
        self.model, self.preprocess = clip.load(model_path, device=self.device)
        print("初始化模型VIT-B")

    def extract_text_feature(self, text):
        """Encode ``text`` with the model and return the result as a NumPy array.

        ``text`` must provide ``.to(device)`` (presumably the output of
        ``clip.tokenize`` -- confirm with the caller). Unlike the image
        features, the text features are returned without normalization.
        """
        with torch.no_grad():
            text = text.to(self.device)
            return self.model.encode_text(text).cpu().numpy()

    def extract_image_feature(self, image_path):
        """Return the L2-normalized feature vector of the image at ``image_path``.

        The image is preprocessed, encoded as a batch of one, divided by its
        norm along the last axis, and the single row is returned as a NumPy
        array.
        """
        with torch.no_grad():
            # Close the file handle as soon as the pixels have been consumed;
            # the original leaked one handle per image.
            with Image.open(image_path) as image:
                batch = self.preprocess(image).unsqueeze(0).to(self.device)
            image_feature = self.model.encode_image(batch)
            image_feature /= image_feature.norm(dim=-1, keepdim=True)
            return image_feature.cpu().numpy()[0]
class RN101(PreModel):
    """Feature extractor backed by the RN101 CLIP checkpoint.

    Mirrors ``VIT_B``: same attributes, same method contracts.

    Args:
        model_path: Checkpoint path handed to ``clip.load``. Defaults to the
            path the original script used.
        device: Torch device string. When omitted, ``"cuda:0"`` is used if
            CUDA is available and ``"cpu"`` otherwise.
    """

    def __init__(self, model_path: str = './mine/RN101/RN101.pt', device=None):
        super().__init__()
        # The original never set `name` (it stayed "none") nor `device`, so
        # both extract_* methods failed with AttributeError on `self.device`.
        self.name = "RN101"
        if device is None:
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = device
        self.device_cpu = "cpu"
        # Load onto the same device the inputs are later moved to.
        self.model, self.preprocess = clip.load(model_path, device=self.device)
        print("初始化模型RN-101")

    def extract_text_feature(self, text):
        """Encode ``text`` with the model and return the result as a NumPy array.

        ``text`` must provide ``.to(device)`` (presumably the output of
        ``clip.tokenize`` -- confirm with the caller). No normalization is
        applied.
        """
        with torch.no_grad():
            text = text.to(self.device)
            return self.model.encode_text(text).cpu().numpy()

    def extract_image_feature(self, image_path):
        """Return the L2-normalized feature vector of the image at ``image_path``.

        Fixes two defects of the original: the feature was *replaced* by its
        norm instead of being divided by it, and the preprocessed image tensor
        was returned instead of the encoded feature.
        """
        with torch.no_grad():
            with Image.open(image_path) as image:
                batch = self.preprocess(image).unsqueeze(0).to(self.device)
            image_feature = self.model.encode_image(batch)
            image_feature /= image_feature.norm(dim=-1, keepdim=True)
            # Single-image batch: return the one row, like VIT_B does.
            return image_feature.cpu().numpy()[0]
class Data():
    """Dataset index plus cached feature extraction, few-shot selection and evaluation.

    Attributes set by ``__init__``:
        train_img_names / train_img_labels: NumPy arrays read from the
            training index file (one ``<image name> <label>`` pair per line).
        test_img_names: ``os.listdir`` of the test directory.
        class_labels / labels_class: class-name -> id and id -> class-name
            dictionaries read from the class file.
        train_nums / class_nums: number of training rows / distinct labels.

    Features are kept in memory as NumPy arrays and cached on disk as torch
    tensors.
    """

    def __init__(self,train_data_index_path:str,train_data_path:str,class_labels:str,test_data_path) -> None:
        """Read the training index, the test directory listing and the class table.

        Args:
            train_data_index_path: Text file with ``<image name> <label>`` per line.
            train_data_path: Directory the image names are relative to.
            class_labels: Text file with ``<class name> <id>`` per line.
            test_data_path: Directory holding the test images.
        """
        self.train_data_path = train_data_path
        self.test_data_path = test_data_path

        names = []
        labels = []
        with open(train_data_index_path, 'r') as f:
            for line in f:
                fields = line.split()
                if len(fields) < 2:
                    continue  # blank or malformed line
                names.append(fields[0])
                # int() on the whole field: the original chopped the last
                # character, which lost a digit when the final line had no
                # trailing newline.
                labels.append(int(fields[1]))
        self.train_img_names = np.array(names)
        self.train_img_labels = np.array(labels, dtype=int)
        # NOTE: os.listdir order is filesystem-dependent; cached test features
        # follow whatever order was returned when they were extracted.
        self.test_img_names = os.listdir(test_data_path)
        self.train_nums = len(self.train_img_labels)
        self.class_nums = len(set(self.train_img_labels))

        # Class table: natural-language class name <-> numeric id.
        self.class_labels = {}
        self.labels_class = {}
        with open(class_labels, "r") as f:
            for line in f:
                fields = line.split()
                if len(fields) < 2:
                    continue
                class_id = int(fields[1])
                self.class_labels[fields[0]] = class_id
                self.labels_class[class_id] = fields[0]

    def _load_or_extract_features(self, cache_path, image_dir, image_names, model):
        """Return ``(features, loaded_from_cache)`` for one image set.

        Loads ``cache_path`` when it exists. Otherwise asks the user whether
        to extract; on "Y" every image is run through
        ``model.extract_image_feature`` and the result is cached, on any other
        answer an empty list is returned.
        """
        try:
            return torch.load(cache_path).cpu().numpy(), True
        except FileNotFoundError:
            pass
        ans = input("未找到文件,是否重新提取保存?(Y/N)")
        if ans.strip().upper() != 'Y':
            return [], False
        # Create the cache directory up front so a long extraction cannot be
        # lost to a failing torch.save at the very end.
        os.makedirs(os.path.dirname(cache_path) or '.', exist_ok=True)
        with torch.no_grad():
            features = np.array([
                model.extract_image_feature(image_dir + '/' + name)
                for name in tqdm(image_names)
            ])
        torch.save(torch.Tensor(features), cache_path)
        return features, False

    def extract_train_feature(self,path,model):
        """Populate ``self.train_features`` from ``path + '/train_features.pt'`` or by extraction.

        If the cache is missing and the user declines extraction,
        ``self.train_features`` is left as an empty list.
        """
        print("提取训练集数据特征")
        self.train_features_path = path + '/train_features.pt'
        self.train_features, from_cache = self._load_or_extract_features(
            self.train_features_path, self.train_data_path, self.train_img_names, model)
        if from_cache:
            print("成功直接加载保存的训练数据特征"+str(self.train_features.shape))

    def extract_test_feature(self,path,model):
        """Populate ``self.test_features`` from ``path + '/test_features'`` or by extraction.

        If the cache is missing and the user declines extraction,
        ``self.test_features`` is left as an empty list.
        """
        print("提取测试集特征")
        self.test_features_path = path + '/test_features'
        self.test_features, from_cache = self._load_or_extract_features(
            self.test_features_path, self.test_data_path, self.test_img_names, model)
        if from_cache:
            print("成功直接加载保存的测试数据特征")

    @staticmethod
    def _pick_representatives(distances, n_closest=2, n_random=2):
        """Choose positions inside one class from their distances to the class center.

        Returns ``n_closest`` positions nearest to the center followed by
        ``n_random`` positions drawn (via ``np.random.choice``) from the
        samples that are no farther than the mean distance, skipping the
        nearest ones and preferring the nearer half. Smaller pools fall back
        to progressively wider ones so tiny classes still yield a full set.
        """
        distances = np.asarray(distances)
        order = np.argsort(distances)  # nearest first
        # np.resize cycles the array, so a class smaller than n_closest
        # repeats its samples instead of raising IndexError.
        closest = np.resize(order, n_closest)
        near = order[distances[order] <= distances.mean()]
        pools = (
            near[n_closest:near.shape[0] // 2],
            near[n_closest:],
            order[n_closest:],
            order,
        )
        pool = next((p for p in pools if p.shape[0] >= n_random), order)
        extra = np.random.choice(pool, size=n_random, replace=pool.shape[0] < n_random)
        return np.concatenate([closest, extra])

    def select_train_feature(self,feature_path,label_path):
        """Load or build the few-shot training subset (four images per class).

        A previously saved selection (``selected_train_features.pt`` in
        ``feature_path`` and ``selected_train_labels.txt`` in ``label_path``)
        is reused when readable. Otherwise, for every class a one-cluster
        KMeans (k-means++ init, ``random_state=0``) gives the class center,
        and ``_pick_representatives`` picks the images; the selected rows are
        stored consecutively per class and then saved to those two files.

        Sets ``selected_train_feature``, ``selected_train_labels`` and
        ``selected_train_names``.

        Raises:
            ValueError: if a new selection is needed but ``train_features``
                is missing or does not match the training index.
        """
        print("选择训练数据特征")
        feature_file = feature_path + '/selected_train_features.pt'
        label_file = label_path + '/selected_train_labels.txt'
        try:
            cached_features = torch.load(feature_file).cpu().numpy()
            with open(label_file, 'r') as f:
                # split() also drops the trailing newline; the original kept
                # it on every name, so test_on_train never recognised the
                # selected images after a reload.
                rows = [line.split() for line in f if line.strip()]
            cached_labels = np.array([int(row[0]) for row in rows])
            cached_names = np.array([row[1] for row in rows])
        except Exception:
            # Deliberate best effort: any unreadable cache triggers a fresh
            # selection. Unlike the original bare `except:`, this no longer
            # traps KeyboardInterrupt / SystemExit.
            pass
        else:
            self.selected_train_feature = cached_features
            self.selected_train_labels = cached_labels
            self.selected_train_names = cached_names
            print("成功直接加载选择的训练集特征")
            return

        train_features = np.asarray(getattr(self, 'train_features', []))
        if train_features.shape[0] != self.train_img_names.shape[0]:
            raise ValueError(
                "train features are missing or do not match the training index; "
                "run extract_train_feature first")
        print('总的图片类别数 : '+str(self.class_nums))
        picked = []
        # Group by label value instead of assuming the index file is sorted
        # with contiguous ids starting at 0.
        for label in tqdm(np.unique(self.train_img_labels)):
            members = np.flatnonzero(self.train_img_labels == label)
            class_features = train_features[members]
            # random_state=0 keeps the clustering itself repeatable.
            kmeans = KMeans(n_clusters=1, init='k-means++', random_state=0)
            kmeans.fit(class_features)
            center = kmeans.cluster_centers_[0]
            distances = np.sqrt(np.sum((class_features - center) ** 2, axis=1))
            picked.extend(members[self._pick_representatives(distances)])
        picked = np.array(picked, dtype=int)
        self.selected_train_names = self.train_img_names[picked]
        self.selected_train_feature = train_features[picked]
        self.selected_train_labels = self.train_img_labels[picked]
        print("选取训练数据的形状 : "+str(self.selected_train_feature.shape))
        torch.save(torch.Tensor(self.selected_train_feature), feature_file)
        with open(label_file, 'w') as f:
            for label, name in zip(self.selected_train_labels, self.selected_train_names):
                f.write(str(label)+' '+str(name)+'\n')
        print("保存选取训练数据")

    def test_on_train(self,model,data_num=10000):
        """Score ``model`` on training images that were not selected for training.

        Args:
            model: Object with ``predict(features)`` returning one label per row.
            data_num: Number of held-out images to sample without
                replacement; clamped to the number actually available.

        Returns:
            ``(accuracy, error_set)`` where ``error_set`` lists the names of
            the misclassified images.
        """
        names = np.asarray(self.train_img_names)
        # Vectorized membership test replaces the per-image Python scan.
        held_out = np.flatnonzero(~np.isin(names, self.selected_train_names))
        sample_size = min(data_num, held_out.shape[0])
        if sample_size <= 0:
            print("测试数据总数 : "+str(0))
            print("预测正确总数 : "+str(0))
            return 0.0, []
        test_indices = np.random.choice(held_out, size=sample_size, replace=False)
        tmp_test_features = np.asarray(self.train_features)[test_indices]
        tmp_test_labels = np.asarray(self.train_img_labels)[test_indices]
        pred_re = np.asarray(model.predict(tmp_test_features))
        wrong = pred_re != tmp_test_labels
        right_num = int(sample_size - np.count_nonzero(wrong))
        error_set = list(names[test_indices[wrong]])
        print("测试数据总数 : "+str(sample_size))
        print("预测正确总数 : "+str(right_num))
        print("正确率 : "+str(right_num/sample_size*100)+'%')
        return right_num/sample_size, error_set

    def test_submit(self,model):
        """Write the five most probable labels for every test image to a result file.

        The file is ``result<YYYY-mm-dd HH:MM:SS>.txt`` in the current
        directory; each line is the image name followed by five labels, most
        probable first.
        """
        test_re = model.predict_proba(self.test_features)
        # predict_proba columns follow model.classes_; translate column
        # positions back to labels when the model exposes that attribute.
        classes = getattr(model, 'classes_', None)
        time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        with open('result'+time_str+'.txt','w') as f:
            for name, probs in zip(self.test_img_names, test_re):
                # Descending order so the top-1 prediction comes first; the
                # original wrote the least likely of the five first.
                labels = np.argsort(probs)[::-1][:5]
                if classes is not None:
                    labels = np.asarray(classes)[labels]
                f.write(name+' '+' '.join([str(i) for i in labels])+'\n')
# Dataset and cache locations -- change these to match your own layout.
tdip='./Dataset/train.txt'        # training index, one "<image name> <label>" per line
tdp='./Dataset'                   # directory the training image names are relative to
cl='./Dataset/classes.txt'        # class table, one "<class name> <id>" per line
tsdp='./Dataset/TestSetA'         # directory holding the test images
data_cata='./mine/vib-t/data'     # where extracted / selected features are cached

# Guarded so that importing this file does not load a model or prompt the user.
if __name__ == "__main__":
    # Only the ViT-B extractor is used below; the original also instantiated
    # RN101 here without ever using it, loading a second checkpoint for nothing.
    vit=VIT_B()
    data=Data(tdip,tdp,cl,tsdp)
    data.extract_train_feature(path=data_cata,model=vit)
    data.select_train_feature(feature_path=data_cata,label_path=data_cata)

AI代码解释

  1. 导入库和基础定义
  • 引入torch,用于深度学习计算;
  • torchvision处理图像;
  • PIL,用于图像处理;
  • clip,处理文本与图像嵌入模型;
  • numpy,科学计算;
  • argparse,命令行参数解析;
  • tqdm,进度条;
  • os操作系统;
  • datetime,日期时间操作。
  2. 定义基础类
  • PreModel类:基类,提供基本结构,含属性name
  • VIT_B类:继承PreModel,实现特定模型,加载ViT-B模型,用于文本和图像特征提取;
  • RN101类:同上,加载RN101模型,文本和图像特征提取。
  3. 数据处理类
  • Data类负责数据准备:读取训练、测试数据的图像名称和标签;读取类标签信息(类名与编号的对应关系);用K-Means++聚类中心选择代表性训练数据;保存选取的特征和标签。
  4. 特征提取
  • extract_text_feature:用模型对文本编码,提取文本特征;
  • extract_image_feature:用模型对图像编码,提取图像特征。
  5. 数据处理
  • extract_train_feature:读取训练集特征,存在则加载,否则提取;
  • extract_test_feature:同理测试集。
  6. 特征选择
  • select_train_feature:若已有保存的选取结果则直接读取,否则用K-Means++聚类选取代表性训练数据并保存。
  7. 测试
  • test_on_train:在训练集上测试模型性能,传入模型和测试数据数量,返回准确率及预测错误的样本;(这里AI说得不准确:测试用的是没有参与训练的数据——给的训练集总共18万多张图片,而我选出来用于训练的只有1500张左右)
  • test_submit:用模型预测测试集每张图片属于各类别的概率,取概率最高的5个类别,写入结果文件。

你可能感兴趣的:(ubuntu,机器学习,深度学习,pytorch,openai,分类)