Python训练营-Day20

# --- Environment setup, data loading and train/test split ---
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Silence library warnings so the printed reports stay readable.
warnings.filterwarnings("ignore")

# Select the SimHei font so Chinese text in figures renders, and disable the
# unicode minus glyph so negative tick labels display with that font.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Load the dataset from the current working directory.
data = pd.read_csv('heart.csv')

# Features are every column except 'target'; 'target' is the label.
# NOTE: no scaler is created here -- the features are used unscaled until a
# StandardScaler is fitted on the training split further down the script.
X = data.drop(['target'], axis=1)
y = data['target']

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# --- 1. Baseline: random forest with default hyper-parameters ---
# The baseline is fitted on the training split and scored directly on the
# test split; nothing is tuned here, so no validation split is involved.
import time  # wall-clock timing; a useful reference once tuning gets slow
import warnings

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Keep the console output free of library warnings.
warnings.filterwarnings("ignore")

print("--- 1. 默认参数随机森林 (训练集 -> 测试集) ---")

# Time the fit on the training split plus the prediction on the test split.
start_time = time.time()
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
end_time = time.time()

print(f"训练与预测耗时: {end_time - start_time:.4f} 秒")

# Per-class precision / recall / F1 on the test split.
print("\n默认随机森林 在测试集上的分类报告:")
print(classification_report(y_test, rf_pred))

# Confusion matrix of true labels against predictions on the test split.
print("默认随机森林 在测试集上的混淆矩阵:")
print(confusion_matrix(y_test, rf_pred))
 
# --- 2. SVD dimensionality reduction followed by logistic regression ---
import time

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler is fitted on the training split only
# and then applied to the test split, so no test statistics leak into the fit.
# NOTE: X_train / X_test are rebound to the scaled arrays from here on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(f"训练集形状: {X_train.shape}")
print(f"测试集形状: {X_test.shape}")

# Time this pipeline on its own (decomposition + projection + fit + predict)
# instead of reusing the timer values left over from the baseline model.
svd_start_time = time.time()

# Thin SVD of the training matrix; the rows of Vt_train are the right
# singular vectors, ordered by decreasing singular value.
U_train, sigma_train, Vt_train = np.linalg.svd(X_train, full_matrices=False)
print(f"Vt_train 矩阵形状: {Vt_train.shape}")

# Number of singular directions to keep. Clamp to what the decomposition
# actually produced so the reported k always matches the rows retained.
k = min(9, sigma_train.shape[0])
Vt_k = Vt_train[:k, :]
print(f"保留 k={k} 后的 Vt_k 矩阵形状: {Vt_k.shape}")

# Project the training split onto the top-k right singular vectors.
X_train_reduced = X_train @ Vt_k.T
print(f"降维后训练集形状: {X_train_reduced.shape}")

# Project the test split with the SAME basis learned from the training split.
X_test_reduced = X_test @ Vt_k.T
print(f"降维后测试集形状: {X_test_reduced.shape}")

# Fit a logistic regression classifier on the reduced training features.
model = LogisticRegression(random_state=42)
model.fit(X_train_reduced, y_train)

# Predict on the reduced test features and score the result.
y_pred = model.predict(X_test_reduced)
svd_end_time = time.time()
accuracy = accuracy_score(y_test, y_pred)
print(f"降维后测试集上的准确率: {accuracy:.4f}")

# Optional diagnostic: relative Frobenius-norm error of the rank-k
# reconstruction of the training matrix (how much was lost by truncating).
X_train_approx = U_train[:, :k] @ np.diag(sigma_train[:k]) @ Vt_k
error = np.linalg.norm(X_train - X_train_approx, 'fro') / np.linalg.norm(X_train, 'fro')
print(f"训练集近似误差 (Frobenius 范数相对误差): {error}")
print(f"训练与预测耗时: {svd_end_time - svd_start_time:.4f} 秒")

# The evaluated model is the logistic regression above, so label it as such.
print("\n奇异值分解后逻辑回归 在测试集上的分类报告:")
print(classification_report(y_test, y_pred))
print("奇异值分解后逻辑回归 在测试集上的混淆矩阵:")
print(confusion_matrix(y_test, y_pred))

@浙大疏锦行

你可能感兴趣的:(Python训练营-Day20)