超参数调整专题1
知识点回顾
1. 网格搜索
2. 随机搜索(简单介绍,非重点;实战中很少用到,可以不了解)
3. 贝叶斯优化(2种实现逻辑,以及如何避开必须用交叉验证的问题)
4. time库的计时模块,方便后人查看代码运行时长
今日作业:
对于信贷数据,尝试对其他模型(如 LightGBM 和 KNN)使用贝叶斯优化和网格搜索进行调参
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Configure matplotlib for Chinese text: use the SimHei font for sans-serif
# text and turn off the Unicode minus glyph (presumably because SimHei cannot
# render it -- confirm on the target machine).
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Load the raw dataset. All later preprocessing steps rebind or mutate `data`.
data = pd.read_csv('data.csv')
# Snapshot of the untouched frame, taken before any encoding. It is used below
# to detect the columns that one-hot encoding adds, and it replaces a second
# parse of the same CSV file while keeping the `data2` name available.
data2 = data.copy()

# Names of the string-typed (object dtype) columns in the raw data.
discrete_features = data.select_dtypes(include=['object']).columns.tolist()

# Label-encode "Home Ownership". Values missing from the mapping become NaN.
home_ownership_mapping = {
    'Own Home': 1,
    'Rent': 2,
    'Have Mortgage': 3,
    'Home Mortgage': 4,
}
data['Home Ownership'] = data['Home Ownership'].map(home_ownership_mapping)

# Label-encode "Years in current job" as an ordinal scale from 1 to 11.
years_in_job_mapping = {
    '< 1 year': 1,
    '1 year': 2,
    '2 years': 3,
    '3 years': 4,
    '4 years': 5,
    '5 years': 6,
    '6 years': 7,
    '7 years': 8,
    '8 years': 9,
    '9 years': 10,
    '10+ years': 11,
}
data['Years in current job'] = data['Years in current job'].map(years_in_job_mapping)

# One-hot encode "Purpose"; the original column is replaced by indicator columns.
data = pd.get_dummies(data, columns=['Purpose'])

# Columns that exist now but not in the raw frame are the new indicator
# columns. Cast them to int so they hold 0/1 rather than booleans.
list_final = [col for col in data.columns if col not in data2.columns]
for col in list_final:
    data[col] = data[col].astype(int)

# Map "Term" to 0/1 and rename it so the column name states what 1 means.
term_mapping = {'Short Term': 0, 'Long Term': 1}
data['Term'] = data['Term'].map(term_mapping)
data.rename(columns={'Term': 'Long Term'}, inplace=True)
# Fill missing values in the numeric columns with each column's median.
# The previous code filled with the mode even though the stated intent was the
# median, and it used `data[feature].fillna(..., inplace=True)`, a chained
# assignment that pandas deprecates and that does nothing under Copy-on-Write.
# Assigning the filled Series back to the frame is the supported form.
continuous_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
for feature in continuous_features:
    median_value = data[feature].median()
    data[feature] = data[feature].fillna(median_value)
基础模型
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import time
# Baseline: a random forest with default hyperparameters, used as the
# reference point for the tuned models.
# NOTE(review): X_train, X_test, y_train and y_test are not defined in the
# visible part of this file; a train/test split must run before this section.
print("--- 默认参数随机森林 ---")
start_time = time.time()
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
# Stop the clock right after fitting so the printed "training time" does not
# also include prediction on the test set.
end_time = time.time()
rf_pred = rf_model.predict(X_test)
print(f"训练耗时: {end_time-start_time:.2f}s")
print(classification_report(y_test, rf_pred))
print("混淆矩阵:\n", confusion_matrix(y_test, rf_pred))
from sklearn.model_selection import GridSearchCV
# Exhaustive grid search over random-forest hyperparameters.
# The grid has 3 * 4 * 3 * 3 = 108 combinations, each evaluated with 5-fold
# cross-validation on the training set.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
print("--- 网格搜索 ---")
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,  # use all available CPU cores
    scoring='accuracy',
)
start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()
# Report the elapsed search time; it was measured before but never printed.
print(f"网格搜索耗时: {end_time-start_time:.2f}s")
print(f"最佳参数: {grid_search.best_params_}")
# With the default refit=True, predict() uses the best estimator refitted on
# the whole training set.
best_pred = grid_search.predict(X_test)
print(classification_report(y_test, best_pred))
from skopt import BayesSearchCV
from skopt.space import Integer
# Bayesian optimisation over random-forest hyperparameters with scikit-optimize.
# Each dimension is an inclusive integer range. Unlike the grid search above,
# max_depth has no "unlimited" (None) option here.
search_space = {
    'n_estimators': Integer(50, 200),
    'max_depth': Integer(10, 30),
    'min_samples_split': Integer(2, 10),
    'min_samples_leaf': Integer(1, 4),
}
print("--- 贝叶斯优化 ---")
bayes_search = BayesSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    search_spaces=search_space,
    n_iter=32,  # number of parameter settings sampled
    cv=5,
    n_jobs=-1,  # use all available CPU cores
    scoring='accuracy',
    random_state=42,  # fix the optimiser's sampling so runs are reproducible
)
start_time = time.time()
bayes_search.fit(X_train, y_train)
end_time = time.time()
# Report the elapsed search time; it was measured before but never printed.
print(f"贝叶斯优化耗时: {end_time-start_time:.2f}s")
print(f"最佳参数: {bayes_search.best_params_}")
best_pred = bayes_search.predict(X_test)
print(classification_report(y_test, best_pred))
@浙大疏锦行