Titanic survival prediction with PyTorch

Importing modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing as ps
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset 
from torch.utils.data import DataLoader
# load the data
first_data = pd.read_csv("train.csv")
print(first_data)

(Output: a preview of the raw train.csv DataFrame)
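
Before cleaning, it is worth checking which columns actually contain missing values. The quick inspection below is only a sketch, assuming train.csv is the standard Kaggle Titanic training file.

# sketch: inspect missing values and column types before cleaning
print(first_data.isnull().sum()) # Age, Cabin and Embarked contain NaNs in the standard Kaggle file
print(first_data.dtypes)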

Data cleaning

# data cleaning
print(len(first_data["Name"].unique()) == first_data.shape[0]) # check whether every Name is unique (it is, so the column is dropped below)
first_data = first_data.drop(["Cabin","Name",'PassengerId','Ticket'],axis = 1)
# fill missing values
age = float(int(first_data["Age"].mean())) # mean
embarked = first_data["Embarked"].value_counts().index[0] # mode
first_data.fillna({"Age":age,"Embarked":embarked},inplace=True)
# encode str ---> int
sex = first_data["Sex"].unique().tolist()
emb = first_data["Embarked"].unique().tolist()
first_data["Sex"] = first_data["Sex"].apply(lambda x: sex.index(x)+1)
first_data["Embarked"] = first_data["Embarked"].apply(lambda x: emb.index(x)+1)
# drop duplicate rows
first_data.drop_duplicates(inplace=True)
# reset the index
first_data.index = range(first_data.shape[0])
# define the train and test indices (277 rows are held out for testing)
import random
test_set = set([])
while len(test_set) < 277:
    num = random.randint(0, first_data.shape[0] - 1)
    test_set.add(num)

test_list = list(test_set)
train_list = [x for x in range(first_data.shape[0]) if x not in test_list]
random.shuffle(train_list) # shuffle the training indices
print(test_list)
print(train_list)
print(len(test_list),len(train_list))
print(len(test_list) + len(train_list) == first_data.shape[0])
# split features and labels
X = first_data[[x for x in first_data.columns if x != "Survived"]]
y = first_data["Survived"]
# standardize the features (note: scaling before the split lets test-set statistics influence the training data)
X = ps.scale(X)
# split into training and test sets
x_train = torch.from_numpy(X[train_list]).type(torch.float32)
y_train = torch.from_numpy(y[train_list].values).type(torch.float32)
x_test = torch.from_numpy(X[test_list]).type(torch.float32)
y_test = torch.from_numpy(y[test_list].values).type(torch.float32)
# initialize hyperparameters and data loaders
batch = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_td = TensorDataset(x_train,y_train)
train_dl = DataLoader(train_td,batch_size=batch,shuffle=False) # train_list was already shuffled above
test_td = TensorDataset(x_test,y_test)
test_dl = DataLoader(test_td,batch_size=batch,shuffle=False)
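
The manual index split above works, but as an alternative sketch (not part of the original pipeline) the same split can be done with sklearn's train_test_split, fitting the scaler on the training rows only so that test-set statistics do not leak into the standardization.

# alternative sketch: split first, then scale with training statistics only
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

raw_X = first_data[[c for c in first_data.columns if c != "Survived"]].values
raw_y = first_data["Survived"].values
x_tr, x_te, y_tr, y_te = train_test_split(raw_X, raw_y, test_size=277, shuffle=True, random_state=0)
scaler = StandardScaler().fit(x_tr) # statistics come from the training rows only
x_train_alt = torch.from_numpy(scaler.transform(x_tr)).type(torch.float32)
x_test_alt = torch.from_numpy(scaler.transform(x_te)).type(torch.float32)
y_train_alt = torch.from_numpy(y_tr).type(torch.float32)
y_test_alt = torch.from_numpy(y_te).type(torch.float32)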

Network design and training (loss function: BCELoss())

# network design (BCELoss())
class Net(nn.Module):
    def __init__(self,inp):
        super(Net,self).__init__()
        self.input = nn.Linear(inp,1)
#         self.hidden = nn.Linear(64,64)
#         self.out = nn.Linear(64,1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self,x):
        x = self.input(x)
#         x = F.relu(x)
#         x = self.hidden(x)
#         x = F.relu(x)
#         x = self.out(x)
        x = self.sigmoid(x)
        
        return x

net1 = Net(x_train.size()[1])
net1.to(device)
optimizer = torch.optim.SGD(net1.parameters(),lr=0.001)
# BCELoss() expects a single probability column here and float labels; it does not apply sigmoid() itself, so the model's last layer must include sigmoid().
loss_func = nn.BCELoss()

high_acc1 = [] # store accuracy per epoch
for i in range(3000):
    # training
    loss_trains = 0.0
    for indexs,(datas,labels) in enumerate(train_dl):
        datas, labels = datas.to(device), labels.to(device)
        #print(index,label)
        predict = net1(datas)
        loss_train = loss_func(predict.squeeze(),labels) # predict is a single column here
        loss_trains += loss_train.item()
        optimizer.zero_grad()
        loss_train.backward()
        optimizer.step()
        #print(predict,loss_train.item())
        if indexs % 2 == 0:
            print("batch_train_loss:",loss_train.item())
    print("train_loss:{:.4f}".format(loss_trains))
    # validation
    with torch.no_grad():
        loss_tests = 0.0
        accuracy = 0.0
        for data,label in test_dl:
            data, label = data.to(device), label.to(device)
            out = net1(data)
            loss_test = loss_func(out,label.view_as(out))
            loss_tests += loss_test.item()
            pred = out.gt(0.5).float().squeeze() # gt() marks the entries of out that are greater than 0.5
            accuracy += pred.eq(label).sum().item() # eq() compares the two tensors element-wise
        print("epoch:{} test_loss:{:.4f} test_accuracy:{:.4f}".format(i,loss_tests/len(test_dl),accuracy/x_test.size()[0]))
        high_acc1.append(round(accuracy/x_test.size()[0],4))

# print the best accuracy
print(max(high_acc1))
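
Since BCELoss() requires the sigmoid() inside the model, a common variant, shown below only as a hedged sketch rather than the original setup, is to drop the sigmoid layer and use BCEWithLogitsLoss(), which applies the sigmoid internally and is more numerically stable. One training step on a single batch looks like this:

# sketch: same single-layer model without an explicit sigmoid, trained with BCEWithLogitsLoss()
logit_net = nn.Linear(x_train.size()[1], 1).to(device) # outputs raw logits
logit_loss = nn.BCEWithLogitsLoss() # sigmoid is applied inside the loss
logit_opt = torch.optim.SGD(logit_net.parameters(), lr=0.001)

xb, yb = next(iter(train_dl))
xb, yb = xb.to(device), yb.to(device)
loss = logit_loss(logit_net(xb).squeeze(), yb) # labels stay float, as with BCELoss()
logit_opt.zero_grad()
loss.backward()
logit_opt.step()
print(loss.item())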


Network design and training (loss function: CrossEntropyLoss())

# network design (CrossEntropyLoss())
class Net2(nn.Module):
    def __init__(self,inp):
        super(Net2,self).__init__()
        self.input = nn.Linear(inp,2)
#         self.hidden = nn.Linear(64,64)
#         self.out = nn.Linear(64,1)
#         self.log_softmax = nn.LogSoftmax()
        
    def forward(self,x):
        x = self.input(x)
#         x = F.relu(x)
#         x = self.hidden(x)
#         x = F.relu(x)
#         x = self.out(x)
#         x = self.log_softmax(x)
        
        return x

net2 = Net2(x_train.size()[1])
net2.to(device)
# CrossEntropyLoss() works on multi-column logits (one column per class) and expects long labels;
# it applies log_softmax() internally, so the model's last layer does not need a log_softmax() activation.
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net2.parameters(),lr=0.001)

high_acc2 = [] # store accuracy per epoch
for i in range(3000):
    # training
    loss_trains = 0.0
    for indexs,(datas,labels) in enumerate(train_dl):
        labels = labels.type(torch.long)
        datas, labels = datas.to(device), labels.to(device)
        #print(index,label)
        predict = net2(datas)
        #print(predict)
        loss_train = loss_func(predict.squeeze(),labels) # predict has two columns (one logit per class)
        loss_trains += loss_train.item()
        optimizer.zero_grad()
        loss_train.backward()
        optimizer.step()
        #print(predict,loss_train.item())
        if indexs % 2 == 0:
            print("batch_train_loss:",loss_train.item())
    print("train_loss:{:.4f}".format(loss_trains))
    # validation
    with torch.no_grad():
        loss_tests = 0.0
        accuracy = 0.0
        for data,label in test_dl:
            label = label.type(torch.long) # CrossEntropyLoss() needs long labels
            data, label = data.to(device), label.to(device)
            out = net2(data)
            loss_test = loss_func(out,label)
            loss_tests += loss_test.item()
#             soft = F.softmax(out,dim=1)   # optional: softmax does not change the argmax
            pred = torch.argmax(out,dim=1)
            accuracy += pred.eq(label).sum().item()
        print("epoch:{} test_loss:{:.4f} test_accuracy:{:.4f}".format(i,loss_tests/len(test_dl),accuracy/x_test.size()[0]))
        high_acc2.append(round(accuracy/x_test.size()[0],4))

# print the best accuracy
print(max(high_acc2))
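
To see why Net2 needs no activation on its last layer, the small check below (a sketch using the already-trained net2 and one test batch) verifies that CrossEntropyLoss() gives the same value as log_softmax() followed by NLLLoss().

# sketch: CrossEntropyLoss() == log_softmax() + NLLLoss() on one test batch
xb, yb = next(iter(test_dl))
xb, yb = xb.to(device), yb.to(device).type(torch.long)
logits = net2(xb)
ce = nn.CrossEntropyLoss()(logits, yb)
nll = nn.NLLLoss()(F.log_softmax(logits, dim=1), yb)
print(ce.item(), nll.item()) # the two values match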

