python打卡训练营学习记录Day11

超参数调整专题1

知识点回顾

  1. 网格搜索
  2. 随机搜索(简单介绍,非重点;实战中很少用到,可以不了解)
  3. 贝叶斯优化(2种实现逻辑,以及如何避开必须用交叉验证的问题)
  4. time库的计时模块,方便后人查看代码运行时长
  5. 代码实现(分别用默认参数、网格搜索、贝叶斯优化训练LightGBM):
    # --- Data loading and preprocessing ---
    # Loads data.csv, encodes the categorical columns as numbers, and fills
    # missing values in the numeric columns with each column's mode.
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    # SimHei is a Chinese font; presumably set so Chinese plot labels render.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Draw minus signs with the ASCII hyphen instead of the Unicode minus.
    plt.rcParams['axes.unicode_minus'] = False
    data = pd.read_csv('data.csv')
    # Remember the raw column names now, so that columns created by the
    # encoding steps below can be identified without re-reading the CSV.
    original_columns = set(data.columns)
    discrete_features = data.select_dtypes(include=['object']).columns.tolist()
    # Label encodings for the string-valued columns. Series.map turns any
    # value that is absent from a mapping into NaN.
    home_ownership_mapping = {
        'Own Home': 1,
        'Rent': 2,
        'Have Mortgage': 3,
        'Home Mortgage': 4,
    }
    years_in_job_mapping = {
        '< 1 year': 1,
        '1 year': 2,
        '2 years': 3,
        '3 years': 4,
        '4 years': 5,
        '5 years': 6,
        '6 years': 7,
        '7 years': 8,
        '8 years': 9,
        '9 years': 10,
        '10+ years': 11,
    }
    term_mapping = {
        'Short Term': 0,
        'Long Term': 1,
    }
    data['Home Ownership'] = data['Home Ownership'].map(home_ownership_mapping)
    data['Years in current job'] = data['Years in current job'].map(years_in_job_mapping)
    data['Term'] = data['Term'].map(term_mapping)
    # After the 0/1 mapping the column is a "is long term" flag, so rename it.
    data.rename(columns={'Term': 'Long Term'}, inplace=True)
    # One-hot encode 'Purpose'; this replaces it with one column per category.
    data = pd.get_dummies(data, columns=['Purpose'])
    # Columns that did not exist in the raw file: the one-hot columns plus the
    # renamed 'Long Term'. Cast them to int so they are plain 0/1 integers.
    list_final = [col for col in data.columns if col not in original_columns]
    for col in list_final:
        data[col] = data[col].astype(int)
    # Fill missing values in every int64/float64 column with that column's mode.
    continuous_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
    for feature in continuous_features:
        mode_value = data[feature].mode()[0]
        # Assign the result back instead of calling fillna(inplace=True) on
        # data[feature]: the chained in-place form acts on a temporary object
        # and does not update `data` when pandas Copy-on-Write is active.
        data[feature] = data[feature].fillna(mode_value)
    # --- Train/test split and baseline LightGBM with default parameters ---
    import time
    import warnings
    import lightgbm as lgb
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.model_selection import train_test_split
    # Suppress all warnings from here on.
    warnings.filterwarnings("ignore")
    # 'Credit Default' is the target; every other column is a feature.
    y = data['Credit Default']
    X = data.drop(columns=['Credit Default'])
    # Hold out 20% of the rows as the test set, with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print("--- 1. 默认参数LightGBM (训练集 -> 测试集) ---")
    # Time fitting and prediction together.
    start_time = time.time()
    lgb_model = lgb.LGBMClassifier(random_state=42)
    lgb_model.fit(X_train, y_train)
    lgb_pred = lgb_model.predict(X_test)
    end_time = time.time()
    print(f"训练与预测耗时: {end_time - start_time:.4f} 秒")
    # Report the baseline's performance on the held-out test set.
    print("\n默认LightGBM 在测试集上的分类报告:")
    print(classification_report(y_test, lgb_pred))
    print("默认LightGBM 在测试集上的混淆矩阵:")
    print(confusion_matrix(y_test, lgb_pred))
    # --- Grid search over LightGBM hyperparameters ---
    from sklearn.model_selection import GridSearchCV
    print("\n--- 2. 网格搜索优化LightGBM (训练集 -> 测试集) ---")
    # Candidate values per hyperparameter: 3 * 4 * 3 * 3 * 3 = 324 combinations,
    # each evaluated with 5-fold cross-validation.
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'learning_rate': [0.01, 0.1, 0.2],
        'reg_alpha': [0, 0.1, 0.5],
        'reg_lambda': [0, 0.1, 0.5],
    }
    grid_search = GridSearchCV(
        estimator=lgb.LGBMClassifier(random_state=42),
        param_grid=param_grid,  # parameter grid to enumerate
        cv=5,                   # 5-fold cross-validation
        n_jobs=-1,              # run in parallel on all available CPU cores
        scoring='accuracy',     # rank candidates by accuracy
    )
    # Time the whole search, which runs on the training split only.
    start_time = time.time()
    grid_search.fit(X_train, y_train)
    end_time = time.time()
    print(f"网格搜索耗时: {end_time - start_time:.4f} 秒")
    print("最佳参数: ", grid_search.best_params_)
    # Evaluate the best estimator found by the search on the test set.
    best_model = grid_search.best_estimator_
    best_pred = best_model.predict(X_test)
    print("\n网格搜索优化后的LightGBM 在测试集上的分类报告:")
    print(classification_report(y_test, best_pred))
    print("网格搜索优化后的LightGBM 在测试集上的混淆矩阵:")
    print(confusion_matrix(y_test, best_pred))
    # --- Bayesian optimisation of LightGBM hyperparameters (scikit-optimize) ---
    print("\n--- 3. 贝叶斯优化LightGBM (训练集 -> 测试集) ---")
    from skopt import BayesSearchCV
    from skopt.space import Integer, Real
    # Search ranges rather than fixed candidate lists. learning_rate is sampled
    # on a log scale.
    search_space = {
        'n_estimators': Integer(50, 200),
        'max_depth': Integer(10, 30),
        'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
        'reg_alpha': Real(0, 1),
        'reg_lambda': Real(0, 1),
    }
    bayes_search = BayesSearchCV(
        estimator=lgb.LGBMClassifier(random_state=42),
        search_spaces=search_space,
        n_iter=32,           # number of parameter settings to try
        cv=5,                # 5-fold cross-validation
        n_jobs=-1,           # run in parallel on all available CPU cores
        scoring='accuracy',  # rank candidates by accuracy
        # Seed the optimiser like the split and the estimator, so the sampled
        # settings and the reported best parameters are repeatable.
        random_state=42,
    )
    # Time the whole search, which runs on the training split only.
    start_time = time.time()
    bayes_search.fit(X_train, y_train)
    end_time = time.time()
    print(f"贝叶斯优化耗时: {end_time - start_time:.4f} 秒")
    print("最佳参数: ", bayes_search.best_params_)
    # Evaluate the best estimator found by the search on the test set.
    best_model = bayes_search.best_estimator_
    best_pred = best_model.predict(X_test)
    print("\n贝叶斯优化后的LightGBM 在测试集上的分类报告:")
    print(classification_report(y_test, best_pred))
    print("贝叶斯优化后的LightGBM 在测试集上的混淆矩阵:")
    print(confusion_matrix(y_test, best_pred))

  @浙大疏锦行

你可能感兴趣的:(python,学习,机器学习)