60天Python训练 day12

常见的几种优化算法:

  1. 遗传算法
  2. 粒子群优化
  3. 模拟退火

核心思想:

  1. 这些启发式算法都是优化器。你的目标是找到一组超参数,让你的机器学习模型在某个指标(比如验证集准确率)上表现最好。
  2. 这个过程就像在一个复杂的地形(参数空间)上寻找最高峰(最佳性能)。
  3. 启发式算法就是一群聪明的“探险家”,它们用不同的策略(模仿自然、物理现象等)来寻找这个最高峰,而不需要知道地形每一处的精确梯度(导数)。

遗传算法

  • 灵感来源: 生物进化,达尔文的“适者生存”。
  • 简单理解: 把不同的超参数组合想象成一群“个体”。表现好的个体(高验证分)更有机会“繁殖”(它们的参数组合会被借鉴和混合),并可能发生“变异”(参数随机小改动),产生下一代。表现差的个体逐渐被淘汰。一代代下去,种群整体就会越来越适应环境(找到更好的超参数)。
  • 应用感觉: 像是在大范围“撒网”搜索,通过优胜劣汰和随机变动逐步逼近最优解。适合参数空间很大、很复杂的情况。
    # pip install deap -i https://pypi.tuna.tsinghua.edu.cn/simple
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    from sklearn.metrics import classification_report, confusion_matrix
    import warnings
    warnings.filterwarnings("ignore")
    import time
    from deap import base, creator, tools, algorithms # DEAP是一个用于遗传算法和进化计算的Python库
    import random
    import numpy as np
    
    
    
    # --- 2. Genetic-algorithm tuning of the random forest ---
    print("\n--- 2. 遗传算法优化随机森林 (训练集 -> 测试集) ---")

    # Fitness type with a single objective weighted +1.0 (i.e. maximised), and
    # an individual type that is a plain list carrying such a fitness.
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMax)

    # (low, high) integer range of every hyperparameter. The values are fed to
    # random.randint below, which treats both ends as inclusive.
    n_estimators_range = (50, 200)
    max_depth_range = (10, 30)
    min_samples_split_range = (2, 10)
    min_samples_leaf_range = (1, 4)

    # The toolbox holds every generator / operator registered for this GA.
    toolbox = base.Toolbox()

    # One gene generator per hyperparameter: each draws a random integer from
    # the matching range.
    toolbox.register("attr_n_estimators", random.randint,
                     n_estimators_range[0], n_estimators_range[1])
    toolbox.register("attr_max_depth", random.randint,
                     max_depth_range[0], max_depth_range[1])
    toolbox.register("attr_min_samples_split", random.randint,
                     min_samples_split_range[0], min_samples_split_range[1])
    toolbox.register("attr_min_samples_leaf", random.randint,
                     min_samples_leaf_range[0], min_samples_leaf_range[1])

    # An individual is produced by cycling once (n=1) through the gene
    # generators, so its genes are always ordered as
    # [n_estimators, max_depth, min_samples_split, min_samples_leaf].
    gene_generators = (
        toolbox.attr_n_estimators,
        toolbox.attr_max_depth,
        toolbox.attr_min_samples_split,
        toolbox.attr_min_samples_leaf,
    )
    toolbox.register("individual", tools.initCycle, creator.Individual,
                     gene_generators, n=1)

    # A population is a list of individuals produced by the generator above.
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    # 定义评估函数
    def evaluate(individual):
        """Score one GA individual by the accuracy of the forest it encodes.

        Args:
            individual: sequence of four genes, in the order
                (n_estimators, max_depth, min_samples_split, min_samples_leaf).

        Returns:
            A 1-tuple ``(accuracy,)``; the trailing comma matters because the
            result is assigned to ``ind.fitness.values`` by the GA loop.

        NOTE(review): X_train, y_train, X_test and y_test are module-level
        names that are not defined in this section -- presumably created by an
        earlier train/test split; confirm.
        NOTE(review): the score is computed on the test split, so the search
        tunes hyperparameters against the same data used for the final report.
        Consider scoring on a validation split or with cross-validation.
        """
        n_estimators, max_depth, min_samples_split, min_samples_leaf = individual
        forest = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42,
        )
        forest.fit(X_train, y_train)
        predictions = forest.predict(X_test)
        return (accuracy_score(y_test, predictions),)
    
    # Register the function that scores an individual.
    toolbox.register("evaluate", evaluate)

    # Genetic operators. The per-gene mutation bounds are listed in the same
    # order as the genes of an individual
    # (n_estimators, max_depth, min_samples_split, min_samples_leaf).
    gene_ranges = (n_estimators_range, max_depth_range,
                   min_samples_split_range, min_samples_leaf_range)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutUniformInt,
                     low=[lo for lo, _ in gene_ranges],
                     up=[hi for _, hi in gene_ranges],
                     indpb=0.1)
    toolbox.register("select", tools.selTournament, tournsize=3)

    # Initial population of 20 random individuals.
    pop = toolbox.population(n=20)

    # GA settings.
    NGEN = 10    # number of generations
    CXPB = 0.5   # crossover probability handed to varAnd
    MUTPB = 0.2  # mutation probability handed to varAnd

    # Tournament selection is not elitist, so the best individual of an early
    # generation can disappear from later populations. The hall of fame keeps
    # a copy of the best individual seen in any generation.
    hall_of_fame = tools.HallOfFame(1)

    start_time = time.time()
    for gen in range(NGEN):
        # Clone the population and apply crossover / mutation to the clones.
        offspring = algorithms.varAnd(pop, toolbox, cxpb=CXPB, mutpb=MUTPB)

        # varAnd deletes the fitness of every individual it modified, while
        # untouched clones keep the fitness they were selected with. evaluate()
        # uses a fixed random_state, so re-scoring an unchanged individual
        # would only retrain an identical forest -- score the invalid ones only.
        invalid = [ind for ind in offspring if not ind.fitness.valid]
        for ind, fit in zip(invalid, toolbox.map(toolbox.evaluate, invalid)):
            ind.fitness.values = fit

        hall_of_fame.update(offspring)
        pop = toolbox.select(offspring, k=len(pop))
    end_time = time.time()

    # Best individual over the whole run (not just the final population).
    best_ind = hall_of_fame[0]
    best_n_estimators, best_max_depth, best_min_samples_split, best_min_samples_leaf = best_ind

    print(f"遗传算法优化耗时: {end_time - start_time:.4f} 秒")
    print("最佳参数: ", {
        'n_estimators': best_n_estimators,
        'max_depth': best_max_depth,
        'min_samples_split': best_min_samples_split,
        'min_samples_leaf': best_min_samples_leaf
    })

    # Refit a forest with the best hyperparameters and report on the test set.
    best_model = RandomForestClassifier(
        n_estimators=best_n_estimators,
        max_depth=best_max_depth,
        min_samples_split=best_min_samples_split,
        min_samples_leaf=best_min_samples_leaf,
        random_state=42,
    )
    best_model.fit(X_train, y_train)
    best_pred = best_model.predict(X_test)

    print("\n遗传算法优化后的随机森林 在测试集上的分类报告:")
    print(classification_report(y_test, best_pred))
    print("遗传算法优化后的随机森林 在测试集上的混淆矩阵:")
    print(confusion_matrix(y_test, best_pred))

粒子群优化算法

  • 灵感来源: 鸟群或鱼群觅食。
  • 简单理解: 把每个超参数组合想象成一个“粒子”(鸟)。每个粒子在参数空间中“飞行”。它会记住自己飞过的最好位置,也会参考整个“鸟群”发现的最好位置,结合这两者来调整自己的飞行方向和速度,同时带点随机性。
  • 应用感觉: 像是一群探险家,既有自己的探索记忆,也会互相交流信息(全局最佳位置),集体协作寻找目标。通常收敛比遗传算法快一些。

粒子群方法的思想比较简单,所以甚至可以不调库自己实现。

# Section banner for the particle swarm optimisation run.
print("\n--- 2. 粒子群优化算法优化随机森林 (训练集 -> 测试集) ---")


# 定义适应度函数,本质就是构建了一个函数实现 参数--> 评估指标的映射
def fitness_function(params):
    """Map a hyperparameter vector to the accuracy of the resulting forest.

    Args:
        params: iterable of exactly four numbers, ordered as
            (n_estimators, max_depth, min_samples_split, min_samples_leaf).
            Each value is truncated with ``int()`` before use, so the
            optimiser may work with floats.

    Returns:
        The value of ``accuracy_score`` on (y_test, predictions for X_test).

    NOTE(review): X_train, y_train, X_test and y_test are module-level names
    not defined in this section -- presumably from an earlier split; confirm.
    NOTE(review): scoring on the test split means the hyperparameters are
    tuned against the data used for the final report; consider a validation
    split or cross-validation instead.
    """
    # Sequence unpacking: raises ValueError unless params has four entries.
    n_estimators, max_depth, min_samples_split, min_samples_leaf = params
    hyperparams = {
        "n_estimators": int(n_estimators),
        "max_depth": int(max_depth),
        "min_samples_split": int(min_samples_split),
        "min_samples_leaf": int(min_samples_leaf),
    }
    forest = RandomForestClassifier(random_state=42, **hyperparams)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)
    return accuracy_score(y_test, predictions)


# 粒子群优化算法实现
def pso(num_particles, num_iterations, c1, c2, w, bounds, fitness_fn=None):
    """Maximise a fitness function with a basic particle swarm optimiser.

    Args:
        num_particles: number of particles in the swarm (must be >= 1).
        num_iterations: number of velocity/position updates to perform.
        c1: cognitive factor -- pull of a particle towards its own best
            position so far.
        c2: social factor -- pull of a particle towards the swarm's best
            position so far.
        w: inertia weight applied to the previous velocity.
        bounds: sequence of ``(low, high)`` pairs, one per parameter; positions
            are clamped to these ranges after every move.
        fitness_fn: callable taking one position (a 1-D sequence of floats)
            and returning a number to maximise. Defaults to the module-level
            ``fitness_function`` so existing callers are unaffected.

    Returns:
        ``(best_position, best_fitness)`` where ``best_position`` is a 1-D
        NumPy array owned by the caller.

    Raises:
        ValueError: if ``num_particles`` is smaller than 1 or ``bounds`` is
            empty.
    """
    if fitness_fn is None:
        fitness_fn = fitness_function
    if num_particles < 1:
        raise ValueError("num_particles must be at least 1")
    num_params = len(bounds)
    if num_params == 0:
        raise ValueError("bounds must contain at least one (low, high) pair")

    lower = np.array([low for low, _ in bounds], dtype=float)
    upper = np.array([high for _, high in bounds], dtype=float)

    # Random start inside the box; particles start at rest.
    particles = np.array([[random.uniform(low, high) for low, high in bounds]
                          for _ in range(num_particles)])
    velocities = np.zeros_like(particles)

    personal_best = particles.copy()
    personal_best_fitness = np.array([fitness_fn(p) for p in particles])
    best_index = np.argmax(personal_best_fitness)
    # Copy the row: indexing would otherwise give a view that changes whenever
    # that particle's personal best is overwritten in place below.
    global_best = personal_best[best_index].copy()
    global_best_fitness = personal_best_fitness[best_index]

    for _ in range(num_iterations):
        # Fresh random factors per particle and per dimension.
        r1 = np.array([[random.random() for _ in range(num_params)]
                       for _ in range(num_particles)])
        r2 = np.array([[random.random() for _ in range(num_params)]
                       for _ in range(num_particles)])

        velocities = (w * velocities
                      + c1 * r1 * (personal_best - particles)
                      + c2 * r2 * (global_best - particles))
        # Move, then clamp every coordinate back into its allowed range.
        particles = np.clip(particles + velocities, lower, upper)

        fitness_values = np.array([fitness_fn(p) for p in particles])
        improved = fitness_values > personal_best_fitness
        personal_best[improved] = particles[improved]
        personal_best_fitness[improved] = fitness_values[improved]

        best_index = np.argmax(personal_best_fitness)
        if personal_best_fitness[best_index] > global_best_fitness:
            global_best = personal_best[best_index].copy()
            global_best_fitness = personal_best_fitness[best_index]

    return global_best, global_best_fitness


# Search range (low, high) of each hyperparameter, ordered as
# n_estimators, max_depth, min_samples_split, min_samples_leaf.
bounds = [(50, 200), (10, 30), (2, 10), (1, 4)]

# Particle swarm settings.
num_particles = 20   # swarm size
num_iterations = 10  # number of swarm updates
c1 = 1.5             # cognitive factor
c2 = 1.5             # social factor
w = 0.5              # inertia weight

start_time = time.time()
best_params, best_fitness = pso(num_particles, num_iterations, c1, c2, w, bounds)
end_time = time.time()

# The optimiser works on floats; truncate to the integers the model expects.
best_hyperparams = {
    'n_estimators': int(best_params[0]),
    'max_depth': int(best_params[1]),
    'min_samples_split': int(best_params[2]),
    'min_samples_leaf': int(best_params[3]),
}

print(f"粒子群优化算法优化耗时: {end_time - start_time:.4f} 秒")
print("最佳参数: ", best_hyperparams)

# Refit a forest with the best hyperparameters and report on the test set.
best_model = RandomForestClassifier(random_state=42, **best_hyperparams)
best_model.fit(X_train, y_train)
best_pred = best_model.predict(X_test)

print("\n粒子群优化算法优化后的随机森林 在测试集上的分类报告:")
print(classification_report(y_test, best_pred))
print("粒子群优化算法优化后的随机森林 在测试集上的混淆矩阵:")
print(confusion_matrix(y_test, best_pred))

模拟退火算法

  • 灵感来源: 金属冶炼中的退火过程(缓慢冷却使金属达到最低能量稳定态)。
  • 简单理解: 从一个随机的超参数组合开始。随机尝试改变一点参数。如果新组合更好,就接受它。如果新组合更差,也有一定概率接受它(尤其是在“高温”/搜索早期)。这个接受坏解的概率会随着时间(“降温”)慢慢变小。
  • 应用感觉: 像一个有点“冲动”的探险家,初期愿意尝试一些看起来不太好的路径(为了跳出局部最优的小山谷),后期则越来越“保守”,专注于在当前找到的好区域附近精细搜索。擅长避免陷入局部最优。
# Section banner for the simulated annealing run.
print("\n--- 2. 模拟退火算法优化随机森林 (训练集 -> 测试集) ---")


# 定义适应度函数
def fitness_function(params):
    """Return the accuracy of a random forest built from ``params``.

    Args:
        params: iterable of exactly four numbers, ordered as
            (n_estimators, max_depth, min_samples_split, min_samples_leaf).
            Every value is truncated with ``int()`` before it reaches the
            model, so float-valued candidate solutions are accepted.

    Returns:
        ``accuracy_score`` of the fitted model's predictions for X_test
        against y_test.

    NOTE(review): X_train, y_train, X_test and y_test are module-level names
    not defined in this section -- presumably from an earlier split; confirm.
    NOTE(review): the objective is measured on the test split, so the search
    tunes against the data used for the final report; a validation split or
    cross-validation would avoid that.
    """
    n_estimators, max_depth, min_samples_split, min_samples_leaf = params
    classifier = RandomForestClassifier(
        random_state=42,
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
    )
    classifier.fit(X_train, y_train)
    return accuracy_score(y_test, classifier.predict(X_test))


# 模拟退火算法实现
def simulated_annealing(initial_solution, bounds, initial_temp, final_temp, alpha,
                        fitness_fn=None):
    """Maximise a fitness function with simulated annealing.

    Starting from ``initial_solution``, each step perturbs every coordinate by
    up to +/-10% of its range, clamps it to ``bounds``, and accepts the
    neighbour if it is better, or with probability ``exp(delta / temp)`` if it
    is not. The temperature is multiplied by ``alpha`` after every step and
    the search stops once it is no longer above ``final_temp``.

    Args:
        initial_solution: starting point, one number per parameter. It is
            copied, never modified.
        bounds: sequence of ``(low, high)`` pairs, one per parameter.
        initial_temp: starting temperature.
        final_temp: temperature at which the search stops (must be > 0).
        alpha: cooling factor, strictly between 0 and 1.
        fitness_fn: callable taking a solution (list of floats) and returning
            a number to maximise. Defaults to the module-level
            ``fitness_function`` so existing callers are unaffected.

    Returns:
        ``(best_solution, best_fitness)`` -- the best solution visited (a
        list) and its fitness.

    Raises:
        ValueError: if ``alpha`` is not in (0, 1), ``final_temp`` is not
            positive (either would keep the loop from terminating properly),
            or ``initial_solution`` and ``bounds`` differ in length.
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    if final_temp <= 0:
        raise ValueError("final_temp must be positive")
    if len(initial_solution) != len(bounds):
        raise ValueError("initial_solution and bounds must have the same length")
    if fitness_fn is None:
        fitness_fn = fitness_function

    # Work on a copy so the returned best never aliases the caller's list.
    current_solution = list(initial_solution)
    current_fitness = fitness_fn(current_solution)
    best_solution, best_fitness = current_solution, current_fitness
    temp = initial_temp

    while temp > final_temp:
        # Neighbour: shift each coordinate by up to 10% of its range, clamped.
        neighbor_solution = []
        for value, (low, high) in zip(current_solution, bounds):
            step = random.uniform(-1, 1) * (high - low) * 0.1
            neighbor_solution.append(max(low, min(high, value + step)))

        neighbor_fitness = fitness_fn(neighbor_solution)
        delta_fitness = neighbor_fitness - current_fitness

        # Always take improvements. Otherwise delta_fitness <= 0, so the
        # exponential is an acceptance probability in (0, 1].
        if delta_fitness > 0 or random.random() < np.exp(delta_fitness / temp):
            current_solution = neighbor_solution
            current_fitness = neighbor_fitness

        if current_fitness > best_fitness:
            best_solution = current_solution
            best_fitness = current_fitness

        temp *= alpha

    return best_solution, best_fitness


# Search range (low, high) of each hyperparameter, ordered as
# n_estimators, max_depth, min_samples_split, min_samples_leaf.
bounds = [(50, 200), (10, 30), (2, 10), (1, 4)]

# Annealing schedule: the temperature starts at initial_temp and is multiplied
# by alpha each step until it is no longer above final_temp (135 steps here).
# NOTE(review): the objective is an accuracy, so fitness differences are small
# compared with these temperatures -- e.g. a 0.01 drop is still accepted with
# probability exp(-0.01 / 0.1) ~= 0.90 at the final temperature. Consider
# scaling both temperatures down so late steps actually reject worse moves.
initial_temp = 100  # starting temperature
final_temp = 0.1    # stopping temperature
alpha = 0.95        # cooling factor

# Random starting point inside the search box.
initial_solution = [random.uniform(low, high) for low, high in bounds]

start_time = time.time()
best_params, best_fitness = simulated_annealing(initial_solution, bounds, initial_temp, final_temp, alpha)
end_time = time.time()

# The search works on floats; truncate to the integers the model expects.
best_hyperparams = {
    'n_estimators': int(best_params[0]),
    'max_depth': int(best_params[1]),
    'min_samples_split': int(best_params[2]),
    'min_samples_leaf': int(best_params[3]),
}

print(f"模拟退火算法优化耗时: {end_time - start_time:.4f} 秒")
print("最佳参数: ", best_hyperparams)

# Refit a forest with the best hyperparameters and report on the test set.
best_model = RandomForestClassifier(random_state=42, **best_hyperparams)
best_model.fit(X_train, y_train)
best_pred = best_model.predict(X_test)

print("\n模拟退火算法优化后的随机森林 在测试集上的分类报告:")
print(classification_report(y_test, best_pred))
print("模拟退火算法优化后的随机森林 在测试集上的混淆矩阵:")
print(confusion_matrix(y_test, best_pred))

@浙大疏锦行

你可能感兴趣的:(python,开发语言)