Hands-On Reinforcement Learning with Python: A Complete Guide from Game AI to Industrial Control

Hands-On AI Model Training with Python (Part 6): Reinforcement Learning from the Basics to Industrial-Grade Applications

Core Value

Previously: we completed the full workflow for supervised learning and AutoML. This installment explores another major area of artificial intelligence: reinforcement learning, in which machines learn autonomously through trial and error.

Highlights of This Installment

  • Complete implementations of 4 core reinforcement learning algorithms
  • Hands-on cases spanning game AI to industrial control
  • Production-deployment and performance-optimization tips
  • Distributed reinforcement learning system architecture
  • Complete, operations-ready code

Comparison of Reinforcement Learning Algorithms

Algorithm | Type | Typical use case | Strength | Module
Q-Learning | Value-based | Discrete state/action spaces | Simple and stable | q_learning.py
Deep Q-Network (DQN) | Value-based | High-dimensional state spaces | Handles complex inputs | dqn.py
Policy Gradient | Policy-based | Continuous or discrete action spaces | Optimizes the policy directly | policy_grad.py
Proximal Policy Optimization (PPO) | Policy-based | Complex environments | Stable, efficient training | ppo.py
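
The table maps each algorithm to the module implemented below. As a quick orientation, here is a small, hypothetical registry (not one of the listed modules) showing how the four agent classes could be selected by name; the class names match the implementations that follow.

# Hypothetical agent registry for illustration only; not part of the original modules.
from q_learning import QLearningAgent
from dqn import DQNAgent
from policy_grad import PolicyGradientAgent
from ppo import PPOAgent

AGENTS = {
    "q_learning": QLearningAgent,             # discrete states and actions
    "dqn": DQNAgent,                          # high-dimensional states, discrete actions
    "policy_gradient": PolicyGradientAgent,   # direct policy optimization
    "ppo": PPOAgent,                          # stable training in complex environments
}

def make_agent(name: str, **kwargs):
    """Look up an agent class by name and construct it with its own keyword arguments."""
    return AGENTS[name](**kwargs)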

Code Implementation (by module)
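
All four modules below import MODEL_DIR from a shared config module that this installment does not list. A minimal placeholder, assuming models are saved under a local models/ directory, could look like this:

# config.py -- assumed placeholder; the original post does not show this file.
import os

# Directory where trained models are stored; adjust to your project layout.
MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")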

1. Q-Learning Implementation (q_learning.py)

"""
经典Q-Learning算法实现
适用于离散状态和动作空间的问题
"""

import numpy as np
import random
from collections import defaultdict
import logging
from typing import Dict, Tuple, Any
import pickle
import os
from config import MODEL_DIR

logger = logging.getLogger(__name__)

class QLearningAgent:
    def __init__(self,
                action_space: int,
                learning_rate: float = 0.1,
                discount_factor: float = 0.95,
                exploration_rate: float = 1.0,
                exploration_decay: float = 0.995,
                min_exploration: float = 0.01):
        """初始化Q-Learning智能体"""
        self.action_space = action_space
        self.lr = learning_rate
        self.gamma = discount_factor
        self.epsilon = exploration_rate
        self.epsilon_decay = exploration_decay
        self.epsilon_min = min_exploration
        self.q_table = defaultdict(lambda: np.zeros(action_space))
        
    def get_action(self, state: Tuple) -> int:
        """根据当前状态选择动作"""
        if random.random() < self.epsilon:
            return random.randint(0, self.action_space - 1)
        return np.argmax(self.q_table[state])
    
    def update(self,
              state: Tuple,
              action: int,
              reward: float,
              next_state: Tuple,
              done: bool):
        """更新Q表"""
        current_q = self.q_table[state][action]
        max_next_q = np.max(self.q_table[next_state]) if not done else 0
        new_q = current_q + self.lr * (reward + self.gamma * max_next_q - current_q)
        self.q_table[state][action] = new_q
        
        # Decay the exploration rate at the end of each episode
        if done:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
    
    def save(self, filename: str = "q_learning_model.pkl"):
        """保存Q表"""
        os.makedirs(MODEL_DIR, exist_ok=True)
        with open(os.path.join(MODEL_DIR, filename), 'wb') as f:
            pickle.dump(dict(self.q_table), f)
        logger.info(f"Model saved to {filename}")
    
    def load(self, filename: str = "q_learning_model.pkl"):
        """加载Q表"""
        try:
            with open(os.path.join(MODEL_DIR, filename), 'rb') as f:
                self.q_table = defaultdict(lambda: np.zeros(self.action_space), pickle.load(f))
            logger.info(f"Model loaded from {filename}")
        except FileNotFoundError:
            logger.warning("Model file not found; initializing a new Q-table")
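
A minimal training-loop sketch for QLearningAgent, assuming the Gymnasium package and its discrete-state Taxi-v3 environment (neither is specified in the original post); episode counts are illustrative:

# train_q_learning.py -- usage sketch, not from the original post.
import gymnasium as gym
from q_learning import QLearningAgent

env = gym.make("Taxi-v3")
agent = QLearningAgent(action_space=env.action_space.n)

for episode in range(5000):
    state, _ = env.reset()
    done = False
    while not done:
        action = agent.get_action((state,))   # wrap the scalar observation in a tuple key
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        agent.update((state,), action, reward, (next_state,), done)
        state = next_state

agent.save()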

2. Deep Q-Network Implementation (dqn.py)

"""
深度Q网络(DQN)实现
包含经验回放和目标网络等改进
"""

import numpy as np
import random
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import logging
from typing import List, Tuple, Any
import os
from config import MODEL_DIR

logger = logging.getLogger(__name__)

class DQNAgent:
    def __init__(self,
                state_shape: Tuple,
                action_space: int,
                memory_size: int = 10000,
                batch_size: int = 64,
                learning_rate: float = 0.001,
                discount_factor: float = 0.95,
                exploration_rate: float = 1.0,
                exploration_min: float = 0.01,
                exploration_decay: float = 0.995,
                target_update_freq: int = 100):
        """初始化DQN智能体"""
        self.state_shape = state_shape
        self.action_space = action_space
        self.memory = deque(maxlen=memory_size)
        self.batch_size = batch_size
        self.gamma = discount_factor
        self.epsilon = exploration_rate
        self.epsilon_min = exploration_min
        self.epsilon_decay = exploration_decay
        self.target_update_freq = target_update_freq
        self.train_step = 0
        
        # Build the online network and the target network
        self.model = self._build_model(learning_rate)
        self.target_model = self._build_model(learning_rate)
        self._update_target_model()
    
    def _build_model(self, learning_rate: float) -> Sequential:
        """构建深度Q网络"""
        model = Sequential([
            Dense(64, input_shape=self.state_shape, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_space, activation='linear')
        ])
        model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
        return model
    
    def _update_target_model(self):
        """更新目标网络权重"""
        self.target_model.set_weights(self.model.get_weights())
    
    def remember(self,
                state: np.ndarray,
                action: int,
                reward: float,
                next_state: np.ndarray,
                done: bool):
        """存储经验到记忆回放缓冲区"""
        self.memory.append((state, action, reward, next_state, done))
    
    def get_action(self, state: np.ndarray) -> int:
        """根据当前状态选择动作"""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state[np.newaxis], verbose=0)
        return np.argmax(q_values[0])
    
    def replay(self):
        """从记忆回放中学习"""
        if len(self.memory) < self.batch_size:
            return
        
        minibatch = random.sample(self.memory, self.batch_size)
        states = np.array([t[0] for t in minibatch])
        actions = np.array([t[1] for t in minibatch])
        rewards = np.array([t[2] for t in minibatch])
        next_states = np.array([t[3] for t in minibatch])
        dones = np.array([t[4] for t in minibatch])
        
        # Compute target Q-values using the target network
        targets = self.model.predict(states, verbose=0)
        next_q_values = self.target_model.predict(next_states, verbose=0)
        targets[range(self.batch_size), actions] = rewards + self.gamma * np.max(next_q_values, axis=1) * (1 - dones)
        
        # Train the online network
        self.model.fit(states, targets, epochs=1, verbose=0)
        
        # Decay the exploration rate
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        
        # Periodically sync the target network
        self.train_step += 1
        if self.train_step % self.target_update_freq == 0:
            self._update_target_model()
    
    def save(self, filename: str = "dqn_model.h5"):
        """保存模型权重"""
        os.makedirs(MODEL_DIR, exist_ok=True)
        self.model.save(os.path.join(MODEL_DIR, filename))
        logger.info(f"Model saved to {filename}")
    
    def load(self, filename: str = "dqn_model.h5"):
        """加载模型权重"""
        try:
            self.model = tf.keras.models.load_model(os.path.join(MODEL_DIR, filename))
            self._update_target_model()
            logger.info(f"Model loaded from {filename}")
        except OSError:
            logger.warning("Model file not found; initializing a new model")
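
A minimal CartPole training sketch for DQNAgent, assuming Gymnasium and the CartPole-v1 environment (not specified in the original post); hyperparameters and episode counts are illustrative, not tuned:

# train_dqn.py -- usage sketch, not from the original post.
import gymnasium as gym
from dqn import DQNAgent

env = gym.make("CartPole-v1")
agent = DQNAgent(state_shape=env.observation_space.shape,
                 action_space=env.action_space.n)

for episode in range(300):
    state, _ = env.reset()
    done, total_reward = False, 0.0
    while not done:
        action = agent.get_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        agent.remember(state, action, reward, next_state, done)
        agent.replay()                       # learn from a sampled minibatch
        state, total_reward = next_state, total_reward + reward
    print(f"episode {episode}: reward={total_reward:.0f}, epsilon={agent.epsilon:.3f}")

agent.save()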

3. Policy Gradient Implementation (policy_grad.py)

"""
策略梯度(Policy Gradient)算法实现
适用于连续动作空间问题
"""

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import logging
from typing import List, Tuple
import os
from config import MODEL_DIR

logger = logging.getLogger(__name__)

class PolicyGradientAgent:
    def __init__(self,
                state_shape: Tuple,
                action_space: int,
                learning_rate: float = 0.01,
                discount_factor: float = 0.95):
        """初始化策略梯度智能体"""
        self.state_shape = state_shape
        self.action_space = action_space
        self.gamma = discount_factor
        self.states = []
        self.actions = []
        self.rewards = []
        
        # Build the policy network
        self.model = self._build_model(learning_rate)
    
    def _build_model(self, learning_rate: float) -> Sequential:
        """构建策略网络"""
        model = Sequential([
            Dense(64, input_shape=self.state_shape, activation='relu'),
            Dense(64, activation='relu'),
            Dense(self.action_space, activation='softmax')
        ])
        model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=learning_rate))
        return model
    
    def get_action(self, state: np.ndarray) -> int:
        """根据策略选择动作"""
        state = state[np.newaxis]
        probs = self.model.predict(state, verbose=0)[0]
        return np.random.choice(self.action_space, p=probs)
    
    def remember(self,
                state: np.ndarray,
                action: int,
                reward: float):
        """存储轨迹数据"""
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
    
    def train(self):
        """训练策略网络"""
        # Compute discounted returns
        discounted_rewards = []
        running_reward = 0
        for r in reversed(self.rewards):
            running_reward = r + self.gamma * running_reward
            discounted_rewards.insert(0, running_reward)
        
        # Normalize the returns
        discounted_rewards = np.array(discounted_rewards)
        discounted_rewards = (discounted_rewards - np.mean(discounted_rewards)) / (np.std(discounted_rewards) + 1e-8)
        
        # Prepare the training data
        states = np.vstack(self.states)
        actions = np.array(self.actions)
        
        # One-hot encode the actions
        actions_onehot = np.zeros((len(actions), self.action_space))
        actions_onehot[np.arange(len(actions)), actions] = 1
        
        # Train the model, weighting each step by its normalized return (REINFORCE)
        self.model.train_on_batch(states, actions_onehot, sample_weight=discounted_rewards)
        
        # Clear the stored trajectory
        self.states = []
        self.actions = []
        self.rewards = []
    
    def save(self, filename: str = "policy_grad_model.h5"):
        """保存模型"""
        os.makedirs(MODEL_DIR, exist_ok=True)
        self.model.save(os.path.join(MODEL_DIR, filename))
        logger.info(f"Model saved to {filename}")
    
    def load(self, filename: str = "policy_grad_model.h5"):
        """加载模型"""
        try:
            self.model = tf.keras.models.load_model(os.path.join(MODEL_DIR, filename))
            logger.info(f"Model loaded from {filename}")
        except OSError:
            logger.warning("Model file not found; initializing a new model")
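
A minimal REINFORCE-style training sketch for PolicyGradientAgent, again assuming Gymnasium and CartPole-v1 (not specified in the original post); one policy update is performed per finished episode:

# train_policy_grad.py -- usage sketch, not from the original post.
import gymnasium as gym
from policy_grad import PolicyGradientAgent

env = gym.make("CartPole-v1")
agent = PolicyGradientAgent(state_shape=env.observation_space.shape,
                            action_space=env.action_space.n)

for episode in range(1000):
    state, _ = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        agent.remember(state, action, reward)   # store one step of the trajectory
        state = next_state
    agent.train()                               # update once the episode ends

agent.save()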

4. PPO Implementation (ppo.py)

"""
近端策略优化(PPO)算法实现
包含Clipped Surrogate Objective和GAE等改进
"""

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
import tensorflow_probability as tfp
import logging
from typing import Tuple, List
import os
from config import MODEL_DIR

logger = logging.getLogger(__name__)

class PPOAgent:
    def __init__(self,
                state_shape: Tuple,
                action_space: int,
                policy_lr: float = 0.0003,
                value_lr: float = 0.001,
                gamma: float = 0.99,
                gae_lambda: float = 0.95,
                clip_ratio: float = 0.2,
                train_iters: int = 10,
                batch_size: int = 64):
        """初始化PPO智能体"""
        self.state_shape = state_shape
        self.action_space = action_space
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.clip_ratio = clip_ratio
        self.train_iters = train_iters
        self.batch_size = batch_size
        
        # Build the policy network, the value network, and a copy of the old policy
        self.policy, self.value = self._build_networks(policy_lr, value_lr)
        self.policy_old = self._build_networks(policy_lr, value_lr)[0]
        self._update_old_policy()
    
    def _build_networks(self, policy_lr: float, value_lr: float) -> Tuple[Model, Model]:
        """构建策略网络和价值网络"""
        # Shared feature-extraction layers (both heads branch from these)
        inputs = Input(shape=self.state_shape)
        dense1 = Dense(64, activation='tanh')(inputs)
        dense2 = Dense(64, activation='tanh')(dense1)
        
        # Policy head: probability distribution over actions
        policy_dense = Dense(64, activation='tanh')(dense2)
        policy_output = Dense(self.action_space, activation='softmax')(policy_dense)
        policy = Model(inputs=inputs, outputs=policy_output)
        policy.compile(optimizer=Adam(learning_rate=policy_lr))
        
        # Value head: scalar state-value estimate
        value_dense = Dense(64, activation='tanh')(dense2)
        value_output = Dense(1)(value_dense)
        value = Model(inputs=inputs, outputs=value_output)
        value.compile(optimizer=Adam(learning_rate=value_lr), loss='mse')
        
        return policy, value
    
    def _update_old_policy(self):
        """更新旧策略网络"""
        self.policy_old.set_weights(self.policy.get_weights())
    
    def get_action(self, state: np.ndarray) -> Tuple[int, float]:
        """根据策略选择动作"""
        state = state[np.newaxis]
        probs = self.policy_old.predict(state, verbose=0)[0]
        dist = tfp.distributions.Categorical(probs=probs)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        return int(action.numpy()), float(log_prob.numpy())
    
    def compute_gae(self,
                   rewards: np.ndarray,
                   values: np.ndarray,
                   dones: np.ndarray) -> np.ndarray:
        """计算广义优势估计(GAE)"""
        gae = 0
        advantages = np.zeros_like(rewards)
        # Iterate backwards over all T timesteps; values[t + 1] provides the bootstrap
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + self.gamma * values[t + 1] * (1 - dones[t]) - values[t]
            gae = delta + self.gamma * self.gae_lambda * (1 - dones[t]) * gae
            advantages[t] = gae
        return advantages
    
    def train(self,
              states: np.ndarray,
              actions: np.ndarray,
              log_probs: np.ndarray,
              rewards: np.ndarray,
              dones: np.ndarray,
              next_states: np.ndarray):
        """训练PPO网络"""
        # Estimate state values for the current and next states
        values = self.value.predict(states, verbose=0).flatten()
        next_values = self.value.predict(next_states, verbose=0).flatten()
        
        # Compute GAE advantages (append the bootstrap value for the final state)
        advantages = self.compute_gae(rewards, np.append(values, next_values[-1]), dones)
        returns = advantages + values
        
        # Normalize the advantages
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)
        
        # Convert to TensorFlow tensors
        states_tf = tf.convert_to_tensor(states, dtype=tf.float32)
        actions_tf = tf.convert_to_tensor(actions, dtype=tf.int32)
        old_log_probs_tf = tf.convert_to_tensor(log_probs, dtype=tf.float32)
        advantages_tf = tf.convert_to_tensor(advantages, dtype=tf.float32)
        returns_tf = tf.convert_to_tensor(returns, dtype=tf.float32)
        
        # Several epochs of minibatch updates over the collected rollout.
        # Note: the original post truncates here ("shuffle the data"); the remainder of
        # this method is a minimal completion sketch of the clipped-surrogate update,
        # not the author's exact code.
        n = len(states)
        for _ in range(self.train_iters):
            # Shuffle the data and iterate over minibatches
            indices = np.random.permutation(n)
            for start in range(0, n, self.batch_size):
                idx = indices[start:start + self.batch_size]
                with tf.GradientTape() as tape:
                    probs = self.policy(tf.gather(states_tf, idx))
                    dist = tfp.distributions.Categorical(probs=probs)
                    new_log_probs = dist.log_prob(tf.gather(actions_tf, idx))
                    ratio = tf.exp(new_log_probs - tf.gather(old_log_probs_tf, idx))
                    adv = tf.gather(advantages_tf, idx)
                    clipped = tf.clip_by_value(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio) * adv
                    # Clipped surrogate objective (negated because optimizers minimize)
                    policy_loss = -tf.reduce_mean(tf.minimum(ratio * adv, clipped))
                grads = tape.gradient(policy_loss, self.policy.trainable_variables)
                self.policy.optimizer.apply_gradients(zip(grads, self.policy.trainable_variables))

        # Fit the value network to the discounted returns
        self.value.fit(states, returns, epochs=self.train_iters,
                       batch_size=self.batch_size, verbose=0)

        # Sync the old policy for the next rollout collection
        self._update_old_policy()

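A minimal rollout-and-update sketch for PPOAgent, assuming Gymnasium and CartPole-v1 (not specified in the original post); the fixed rollout length and episode handling are simplified for illustration:

# train_ppo.py -- usage sketch, not from the original post.
import numpy as np
import gymnasium as gym
from ppo import PPOAgent

env = gym.make("CartPole-v1")
agent = PPOAgent(state_shape=env.observation_space.shape,
                 action_space=env.action_space.n)

for update in range(100):
    states, actions, log_probs, rewards, dones, next_states = [], [], [], [], [], []
    state, _ = env.reset()
    for _ in range(512):                         # fixed-length rollout
        action, log_prob = agent.get_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        states.append(state); actions.append(action); log_probs.append(log_prob)
        rewards.append(reward); dones.append(float(done)); next_states.append(next_state)
        state = next_state if not done else env.reset()[0]
    agent.train(np.array(states), np.array(actions), np.array(log_probs),
                np.array(rewards), np.array(dones), np.array(next_states))
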
---
For the remaining code, leave a comment and follow the author; the complete implementation will be provided.
