Previously: we walked through the complete supervised learning and AutoML workflow. This installment explores another major area of artificial intelligence: reinforcement learning, where machines learn on their own through trial and error!

Highlights of this installment:

| Algorithm | Type | Best suited for | Strengths | Module |
|---|---|---|---|---|
| Q-Learning | Value-based | Discrete action spaces | Simple and stable | q_learning.py |
| Deep Q-Network | Value-based | High-dimensional state spaces | Handles complex inputs | dqn.py |
| Policy Gradients | Policy-based | Continuous action spaces | Optimizes the policy directly | policy_grad.py |
| Proximal Policy Optimization | Policy-based | Complex environments | Stable, efficient training | ppo.py |
"""
经典Q-Learning算法实现
适用于离散状态和动作空间的问题
"""
import numpy as np
import random
from collections import defaultdict
import logging
from typing import Dict, Tuple, Any
import pickle
import os
from config import MODEL_DIR
logger = logging.getLogger(__name__)
class QLearningAgent:
def __init__(self,
action_space: int,
learning_rate: float = 0.1,
discount_factor: float = 0.95,
exploration_rate: float = 1.0,
exploration_decay: float = 0.995,
min_exploration: float = 0.01):
"""初始化Q-Learning智能体"""
self.action_space = action_space
self.lr = learning_rate
self.gamma = discount_factor
self.epsilon = exploration_rate
self.epsilon_decay = exploration_decay
self.epsilon_min = min_exploration
self.q_table = defaultdict(lambda: np.zeros(action_space))
def get_action(self, state: Tuple) -> int:
"""根据当前状态选择动作"""
if random.random() < self.epsilon:
return random.randint(0, self.action_space - 1)
return np.argmax(self.q_table[state])
def update(self,
state: Tuple,
action: int,
reward: float,
next_state: Tuple,
done: bool):
"""更新Q表"""
current_q = self.q_table[state][action]
max_next_q = np.max(self.q_table[next_state]) if not done else 0
new_q = current_q + self.lr * (reward + self.gamma * max_next_q - current_q)
self.q_table[state][action] = new_q
# 衰减探索率
if done:
self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
def save(self, filename: str = "q_learning_model.pkl"):
"""保存Q表"""
os.makedirs(MODEL_DIR, exist_ok=True)
with open(os.path.join(MODEL_DIR, filename), 'wb') as f:
pickle.dump(dict(self.q_table), f)
logger.info(f"模型已保存到 {filename}")
def load(self, filename: str = "q_learning_model.pkl"):
"""加载Q表"""
try:
with open(os.path.join(MODEL_DIR, filename), 'rb') as f:
self.q_table = defaultdict(lambda: np.zeros(self.action_space), pickle.load(f))
logger.info(f"已从 {filename} 加载模型")
except FileNotFoundError:
logger.warning("未找到模型文件,初始化新Q表")
"""
深度Q网络(DQN)实现
包含经验回放和目标网络等改进
"""
import numpy as np
import random
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import logging
from typing import List, Tuple, Any
import os
from config import MODEL_DIR
logger = logging.getLogger(__name__)
class DQNAgent:
def __init__(self,
state_shape: Tuple,
action_space: int,
memory_size: int = 10000,
batch_size: int = 64,
learning_rate: float = 0.001,
discount_factor: float = 0.95,
exploration_rate: float = 1.0,
exploration_min: float = 0.01,
exploration_decay: float = 0.995,
target_update_freq: int = 100):
"""初始化DQN智能体"""
self.state_shape = state_shape
self.action_space = action_space
self.memory = deque(maxlen=memory_size)
self.batch_size = batch_size
self.gamma = discount_factor
self.epsilon = exploration_rate
self.epsilon_min = exploration_min
self.epsilon_decay = exploration_decay
self.target_update_freq = target_update_freq
self.train_step = 0
# 创建主网络和目标网络
self.model = self._build_model(learning_rate)
self.target_model = self._build_model(learning_rate)
self._update_target_model()
def _build_model(self, learning_rate: float) -> Sequential:
"""构建深度Q网络"""
model = Sequential([
Dense(64, input_shape=self.state_shape, activation='relu'),
Dense(64, activation='relu'),
Dense(self.action_space, activation='linear')
])
model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
return model
def _update_target_model(self):
"""更新目标网络权重"""
self.target_model.set_weights(self.model.get_weights())
def remember(self,
state: np.ndarray,
action: int,
reward: float,
next_state: np.ndarray,
done: bool):
"""存储经验到记忆回放缓冲区"""
self.memory.append((state, action, reward, next_state, done))
def get_action(self, state: np.ndarray) -> int:
"""根据当前状态选择动作"""
if np.random.rand() <= self.epsilon:
return random.randrange(self.action_space)
q_values = self.model.predict(state[np.newaxis], verbose=0)
return np.argmax(q_values[0])
def replay(self):
"""从记忆回放中学习"""
if len(self.memory) < self.batch_size:
return
minibatch = random.sample(self.memory, self.batch_size)
states = np.array([t[0] for t in minibatch])
actions = np.array([t[1] for t in minibatch])
rewards = np.array([t[2] for t in minibatch])
next_states = np.array([t[3] for t in minibatch])
dones = np.array([t[4] for t in minibatch])
# 计算目标Q值
targets = self.model.predict(states, verbose=0)
next_q_values = self.target_model.predict(next_states, verbose=0)
targets[range(self.batch_size), actions] = rewards + self.gamma * np.max(next_q_values, axis=1) * (1 - dones)
# 训练主网络
self.model.fit(states, targets, epochs=1, verbose=0)
# 衰减探索率
self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
# 定期更新目标网络
self.train_step += 1
if self.train_step % self.target_update_freq == 0:
self._update_target_model()
def save(self, filename: str = "dqn_model.h5"):
"""保存模型权重"""
os.makedirs(MODEL_DIR, exist_ok=True)
self.model.save(os.path.join(MODEL_DIR, filename))
logger.info(f"模型已保存到 {filename}")
def load(self, filename: str = "dqn_model.h5"):
"""加载模型权重"""
try:
self.model = tf.keras.models.load_model(os.path.join(MODEL_DIR, filename))
self._update_target_model()
logger.info(f"已从 {filename} 加载模型")
except OSError:
logger.warning("未找到模型文件,初始化新模型")
"""
策略梯度(Policy Gradient)算法实现
适用于连续动作空间问题
"""
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import logging
from typing import List, Tuple
import os
from config import MODEL_DIR
logger = logging.getLogger(__name__)
class PolicyGradientAgent:
def __init__(self,
state_shape: Tuple,
action_space: int,
learning_rate: float = 0.01,
discount_factor: float = 0.95):
"""初始化策略梯度智能体"""
self.state_shape = state_shape
self.action_space = action_space
self.gamma = discount_factor
self.states = []
self.actions = []
self.rewards = []
# 构建策略网络
self.model = self._build_model(learning_rate)
def _build_model(self, learning_rate: float) -> Sequential:
"""构建策略网络"""
model = Sequential([
Dense(64, input_shape=self.state_shape, activation='relu'),
Dense(64, activation='relu'),
Dense(self.action_space, activation='softmax')
])
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=learning_rate))
return model
def get_action(self, state: np.ndarray) -> int:
"""根据策略选择动作"""
state = state[np.newaxis]
probs = self.model.predict(state, verbose=0)[0]
return np.random.choice(self.action_space, p=probs)
def remember(self,
state: np.ndarray,
action: int,
reward: float):
"""存储轨迹数据"""
self.states.append(state)
self.actions.append(action)
self.rewards.append(reward)
def train(self):
"""训练策略网络"""
# 计算折扣回报
discounted_rewards = []
running_reward = 0
for r in reversed(self.rewards):
running_reward = r + self.gamma * running_reward
discounted_rewards.insert(0, running_reward)
# 标准化回报
discounted_rewards = np.array(discounted_rewards)
discounted_rewards = (discounted_rewards - np.mean(discounted_rewards)) / (np.std(discounted_rewards) + 1e-8)
# 准备训练数据
states = np.vstack(self.states)
actions = np.array(self.actions)
# 创建动作的one-hot编码
actions_onehot = np.zeros((len(actions), self.action_space))
actions_onehot[np.arange(len(actions)), actions] = 1
# 训练模型
self.model.train_on_batch(states, actions_onehot, sample_weight=discounted_rewards)
# 清空轨迹
self.states = []
self.actions = []
self.rewards = []
def save(self, filename: str = "policy_grad_model.h5"):
"""保存模型"""
os.makedirs(MODEL_DIR, exist_ok=True)
self.model.save(os.path.join(MODEL_DIR, filename))
logger.info(f"模型已保存到 {filename}")
def load(self, filename: str = "policy_grad_model.h5"):
"""加载模型"""
try:
self.model = tf.keras.models.load_model(os.path.join(MODEL_DIR, filename))
logger.info(f"已从 {filename} 加载模型")
except OSError:
logger.warning("未找到模型文件,初始化新模型")
"""
近端策略优化(PPO)算法实现
包含Clipped Surrogate Objective和GAE等改进
"""
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
import tensorflow_probability as tfp
import logging
from typing import Tuple, List
import os
from config import MODEL_DIR
logger = logging.getLogger(__name__)
class PPOAgent:
def __init__(self,
state_shape: Tuple,
action_space: int,
policy_lr: float = 0.0003,
value_lr: float = 0.001,
gamma: float = 0.99,
gae_lambda: float = 0.95,
clip_ratio: float = 0.2,
train_iters: int = 10,
batch_size: int = 64):
"""初始化PPO智能体"""
self.state_shape = state_shape
self.action_space = action_space
self.gamma = gamma
self.gae_lambda = gae_lambda
self.clip_ratio = clip_ratio
self.train_iters = train_iters
self.batch_size = batch_size
# 创建策略网络和价值网络
self.policy, self.value = self._build_networks(policy_lr, value_lr)
self.policy_old = self._build_networks(policy_lr, value_lr)[0]
self._update_old_policy()
def _build_networks(self, policy_lr: float, value_lr: float) -> Tuple[Model, Model]:
"""构建策略网络和价值网络"""
# 共享特征提取层
inputs = Input(shape=self.state_shape)
dense1 = Dense(64, activation='tanh')(inputs)
dense2 = Dense(64, activation='tanh')(dense1)
# 策略网络输出动作概率分布
policy_dense = Dense(64, activation='tanh')(dense2)
policy_output = Dense(self.action_space, activation='softmax')(policy_dense)
policy = Model(inputs=inputs, outputs=policy_output)
policy.compile(optimizer=Adam(learning_rate=policy_lr))
# 价值网络输出状态价值
value_dense = Dense(64, activation='tanh')(dense2)
value_output = Dense(1)(value_dense)
value = Model(inputs=inputs, outputs=value_output)
value.compile(optimizer=Adam(learning_rate=value_lr), loss='mse')
return policy, value
def _update_old_policy(self):
"""更新旧策略网络"""
self.policy_old.set_weights(self.policy.get_weights())
def get_action(self, state: np.ndarray) -> Tuple[int, float]:
"""根据策略选择动作"""
state = state[np.newaxis]
probs = self.policy_old.predict(state, verbose=0)[0]
dist = tfp.distributions.Categorical(probs=probs)
action = dist.sample()
log_prob = dist.log_prob(action)
return int(action.numpy()), float(log_prob.numpy())
def compute_gae(self,
rewards: np.ndarray,
values: np.ndarray,
dones: np.ndarray) -> np.ndarray:
"""计算广义优势估计(GAE)"""
gae = 0
advantages = np.zeros_like(rewards)
for t in reversed(range(len(rewards) - 1)):
delta = rewards[t] + self.gamma * values[t + 1] * (1 - dones[t]) - values[t]
gae = delta + self.gamma * self.gae_lambda * (1 - dones[t]) * gae
advantages[t] = gae
return advantages
def train(self,
states: np.ndarray,
actions: np.ndarray,
log_probs: np.ndarray,
rewards: np.ndarray,
dones: np.ndarray,
next_states: np.ndarray):
"""训练PPO网络"""
# 计算价值估计和优势
values = self.value.predict(states, verbose=0).flatten()
next_values = self.value.predict(next_states, verbose=0).flatten()
# 计算GAE
advantages = self.compute_gae(rewards, np.append(values, next_values[-1]), dones)
returns = advantages + values
# 标准化优势
advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)
# 转换为TensorFlow张量
states_tf = tf.convert_to_tensor(states, dtype=tf.float32)
actions_tf = tf.convert_to_tensor(actions, dtype=tf.int32)
old_log_probs_tf = tf.convert_to_tensor(log_probs, dtype=tf.float32)
advantages_tf = tf.convert_to_tensor(advantages, dtype=tf.float32)
returns_tf = tf.convert_to_tensor(returns, dtype=tf.float32)
# 多次训练迭代
for _ in range(self.train_iters):
# 随机打乱数据
---
The remaining code is omitted here; leave a comment and follow to receive the complete version.
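
For readers who want a picture of what the omitted loop typically contains, the sketch below shows one policy update under the clipped surrogate objective. It is an independent illustration written against the PPOAgent attributes defined above (policy, clip_ratio, and the minibatch tensors prepared in train()), not the author's omitted code:

```python
# Independent sketch of one PPO clipped-surrogate policy update step.
# Not the author's omitted code; `agent` is a PPOAgent, and the tensor
# arguments are minibatch slices of the tensors built in PPOAgent.train().
import tensorflow as tf
import tensorflow_probability as tfp

def ppo_policy_step(agent, states_tf, actions_tf, old_log_probs_tf, advantages_tf):
    with tf.GradientTape() as tape:
        probs = agent.policy(states_tf, training=True)
        dist = tfp.distributions.Categorical(probs=probs)
        log_probs = dist.log_prob(actions_tf)
        # Probability ratio between the new policy and the old policy
        ratio = tf.exp(log_probs - old_log_probs_tf)
        # Clipped surrogate objective (maximized, hence the leading minus sign)
        clipped = tf.clip_by_value(ratio, 1 - agent.clip_ratio, 1 + agent.clip_ratio)
        loss = -tf.reduce_mean(tf.minimum(ratio * advantages_tf, clipped * advantages_tf))
    grads = tape.gradient(loss, agent.policy.trainable_variables)
    agent.policy.optimizer.apply_gradients(zip(grads, agent.policy.trainable_variables))
    return loss
```

A full update would also fit the value network on the returns and refresh the old policy afterwards via _update_old_policy().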