Reinforcement learning (RL), an important branch of machine learning, optimizes policies through interaction between an agent and its environment. Using classic OpenAI Gym environments (CartPole, FrozenLake, MountainCar) as testbeds, this article systematically walks through the principles, implementation steps, and tuning tricks of the Q-learning algorithm. Through hands-on Python code, it shows the evolution path from the basic tabular algorithm to Deep Q-Networks (DQN) and compares how different hyperparameters (learning rate, discount factor, exploration rate) affect convergence speed. The article offers beginners a reproducible practical guide and also discusses extensions of the algorithm to domains such as robot control and autonomous driving.
Reinforcement learning solves sequential decision-making problems through a trial-and-error feedback loop. Its core elements are the agent, the environment, states, actions, rewards, and the policy.
As a representative model-free algorithm, Q-learning learns a policy by updating a Q-table (a table of state-action values). Its core update rule is:
$$
Q(s,a) \leftarrow Q(s,a) + \alpha \left[ r + \gamma \max_{a'} Q(s',a') - Q(s,a) \right]
$$
where:
- $\alpha$ is the learning rate, controlling the step size of each update;
- $\gamma$ is the discount factor, weighting future rewards against immediate ones;
- $r$ is the immediate reward received after taking action $a$ in state $s$;
- $s'$ is the resulting next state, and $\max_{a'} Q(s',a')$ is the value of the best action available there.
The rest of this article digs into the implementation details of Q-learning step by step through three classic environments.
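To make the update rule concrete before moving to full environments, here is a minimal sketch of a single Q-learning update on a toy 3-state, 2-action table. The transition and all numbers are made up purely for illustration.

```python
import numpy as np

# Toy Q-table: 3 states x 2 actions, initialized to zero
Q = np.zeros((3, 2))
alpha, gamma = 0.1, 0.99   # learning rate and discount factor

# One hypothetical transition: in state 0, action 1 yields reward 1.0 and lands in state 2
s, a, r, s_next = 0, 1, 1.0, 2

# Apply the update rule: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
td_target = r + gamma * np.max(Q[s_next])
Q[s, a] += alpha * (td_target - Q[s, a])
print(Q[s, a])  # 0.1 after this first update, since Q started at zero
```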
import gym
import numpy as np
import matplotlib.pyplot as plt
# Initialize the environment (human rendering is omitted during training for speed)
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Discretization settings: 20 bins per state dimension.
# Cart velocity and pole angular velocity are unbounded, so they are clipped to finite ranges.
n_bins = 20
state_bounds = [(-4.8, 4.8), (-4.0, 4.0), (-0.418, 0.418), (-4.0, 4.0)]

# Initialize the Q-table over the discretized state space
q_table = np.zeros((n_bins, n_bins, n_bins, n_bins, action_size))
# Hyperparameters
alpha = 0.1
gamma = 0.99
epsilon = 0.1
episodes = 10000
def discretize_state(state):
    """Map the continuous 4-D observation to a tuple of bin indices."""
    indices = []
    for value, (low, high) in zip(state, state_bounds):
        value = np.clip(value, low, high)                       # keep the value inside the binning range
        idx = int((value - low) / (high - low) * (n_bins - 1))  # scale to 0 .. n_bins-1
        indices.append(idx)
    return tuple(indices)
# Training loop
rewards = []
for episode in range(episodes):
    state = env.reset()[0]
    state_disc = discretize_state(state)
    total_reward = 0
    terminated = truncated = False
    while not (terminated or truncated):
        # epsilon-greedy action selection
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state_disc])
        # Take the action (the Gym >= 0.26 API returns terminated and truncated separately)
        next_state, reward, terminated, truncated, _ = env.step(action)
        next_state_disc = discretize_state(next_state)
        total_reward += reward
        # Q-table update; do not bootstrap past a terminal state
        best_next_q = 0.0 if terminated else np.max(q_table[next_state_disc])
        td_target = reward + gamma * best_next_q
        td_error = td_target - q_table[state_disc][action]
        q_table[state_disc][action] += alpha * td_error
        state_disc = next_state_disc
    rewards.append(total_reward)
    if episode % 100 == 0:
        print(f"Episode {episode}, Reward: {total_reward}")
# Plot the reward curve
plt.plot(rewards)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Q-learning on CartPole-v1')
plt.show()
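The CartPole script above keeps the exploration rate fixed at epsilon = 0.1. Since the introduction mentions comparing how the exploration rate affects convergence, a common refinement, not part of the original script, is to decay epsilon over episodes so the agent explores heavily at first and exploits later. A minimal sketch of such a schedule (the decay constants are illustrative assumptions):

```python
# Hypothetical exponential epsilon-decay schedule (constants chosen purely for illustration)
epsilon_start, epsilon_min, epsilon_decay = 1.0, 0.01, 0.995
episodes = 10000

epsilon = epsilon_start
for episode in range(episodes):
    # ... run one episode with the current epsilon, exactly as in the training loop above ...
    epsilon = max(epsilon_min, epsilon * epsilon_decay)  # explore a lot early, exploit later
```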
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True)
state_size = env.observation_space.n
action_size = env.action_space.n
# Initialize the Q-table (FrozenLake has a discrete state space, so no discretization is needed)
q_table = np.zeros((state_size, action_size))

# Hyperparameters
alpha = 0.85  # relatively high learning rate to cope with the slippery, stochastic transitions
gamma = 0.95
epsilon = 0.1
episodes = 10000
# Training loop
success_rates = []
for episode in range(episodes):
    state = env.reset()[0]
    total_reward = 0
    terminated = truncated = False
    while not (terminated or truncated):
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])
        next_state, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        # Q-table update (the slippery ice makes state transitions stochastic)
        best_next_q = 0.0 if terminated else np.max(q_table[next_state])
        td_target = reward + gamma * best_next_q
        td_error = td_target - q_table[state][action]
        q_table[state][action] += alpha * td_error
        state = next_state
    # Track the success rate (reaching the goal gives a reward of +1)
    success_rate = 1 if total_reward > 0 else 0
    success_rates.append(success_rate)
    if episode % 1000 == 0:
        print(f"Episode {episode}, Success Rate: {np.mean(success_rates[-1000:]):.2%}")
# Plot the success rate as a 100-episode moving average (raw 0/1 outcomes are too noisy to read)
plt.plot(np.convolve(success_rates, np.ones(100) / 100, mode='valid'))
plt.xlabel('Episode')
plt.ylabel('Success Rate')
plt.title('Q-learning on FrozenLake-v1 (Slippery)')
plt.show()
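Once training has finished, the learned behaviour can be inspected directly from the Q-table. The sketch below, which assumes the `q_table` and FrozenLake `env` defined above, prints the greedy action for each cell of the 4x4 map and measures the greedy policy's success rate with exploration switched off:

```python
# Print the greedy policy as a 4x4 grid of action symbols (0=Left, 1=Down, 2=Right, 3=Up)
action_symbols = ['<', 'v', '>', '^']
greedy_policy = np.argmax(q_table, axis=1)
for row in greedy_policy.reshape(4, 4):
    print(' '.join(action_symbols[a] for a in row))

# Evaluate the greedy policy (no exploration) over a number of episodes
eval_episodes = 1000
successes = 0
for _ in range(eval_episodes):
    state = env.reset()[0]
    terminated = truncated = False
    while not (terminated or truncated):
        state, reward, terminated, truncated, _ = env.step(int(greedy_policy[state]))
    successes += reward > 0
print(f"Greedy policy success rate: {successes / eval_episodes:.2%}")
```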
# Initialize the environment (human rendering is again disabled, since training runs for 20,000 episodes)
env = gym.make('MountainCar-v0')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Initialize the Q-table: position and velocity are each discretized into 100 bins
q_table = np.zeros((100, 100, action_size))

# Hyperparameters
alpha = 0.1
gamma = 0.99
epsilon = 0.1
episodes = 20000
def discretize_mountaincar_state(state):
    """Discretize the MountainCar state (position, velocity) into 100 bins per dimension."""
    pos_low, pos_high = -1.2, 0.6
    vel_low, vel_high = -0.07, 0.07
    pos_scaled = int((state[0] - pos_low) / (pos_high - pos_low) * 99)
    vel_scaled = int((state[1] - vel_low) / (vel_high - vel_low) * 99)
    return (pos_scaled, vel_scaled)
# Training loop (with reward shaping)
rewards = []
for episode in range(episodes):
    state = env.reset()[0]
    state_disc = discretize_mountaincar_state(state)
    total_reward = 0
    terminated = truncated = False
    while not (terminated or truncated):
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state_disc])
        next_state, reward, terminated, truncated, _ = env.step(action)
        next_state_disc = discretize_mountaincar_state(next_state)
        total_reward += reward
        # Reward shaping: the raw reward is -1 per step, so add a position bonus
        # to encourage climbing the right-hand slope
        shaped_reward = reward + 0.1 * next_state[0]  # the further right, the larger the bonus
        best_next_q = 0.0 if terminated else np.max(q_table[next_state_disc])
        td_target = shaped_reward + gamma * best_next_q
        td_error = td_target - q_table[state_disc][action]
        q_table[state_disc][action] += alpha * td_error
        state_disc = next_state_disc
    rewards.append(total_reward)
    if episode % 1000 == 0:
        print(f"Episode {episode}, Steps: {-total_reward}")  # the raw reward is minus the step count
# Plot the number of steps per episode
plt.plot([-r for r in rewards])
plt.xlabel('Episode')
plt.ylabel('Steps to Reach Goal')
plt.title('Q-learning on MountainCar-v0 (with Reward Shaping)')
plt.show()
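After training, it is worth watching the greedy policy drive the car, with exploration switched off and rendering enabled only for this short evaluation. The sketch below assumes the `q_table` and `discretize_mountaincar_state` defined above:

```python
# Evaluate the greedy policy for a few rendered episodes (no exploration, no learning)
eval_env = gym.make('MountainCar-v0', render_mode='human')
for _ in range(3):
    state = eval_env.reset()[0]
    state_disc = discretize_mountaincar_state(state)
    terminated = truncated = False
    steps = 0
    while not (terminated or truncated):
        action = int(np.argmax(q_table[state_disc]))  # always take the greedy action
        state, _, terminated, truncated, _ = eval_env.step(action)
        state_disc = discretize_mountaincar_state(state)
        steps += 1
    print(f"Greedy episode finished in {steps} steps")
eval_env.close()
```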
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
class DQN(nn.Module):
    """Simple fully connected Q-network: maps a state vector to one Q-value per action."""
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)
# Experience replay buffer
replay_buffer = deque(maxlen=100000)

# Networks, optimizer, and training settings. The original article gives the loop below as pseudocode;
# the setup here (CartPole environment, Adam with lr=1e-3, 500 episodes) is one straightforward way
# to make it concrete and runnable.
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
dqn = DQN(state_size, action_size)
target_dqn = DQN(state_size, action_size)
target_dqn.load_state_dict(dqn.state_dict())
optimizer = optim.Adam(dqn.parameters(), lr=1e-3)
gamma, epsilon, episodes, batch_size = 0.99, 0.1, 500, 32

# Training loop
for episode in range(episodes):
    state = env.reset()[0]
    total_reward = 0
    terminated = truncated = False
    while not (terminated or truncated):
        # epsilon-greedy action selection
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            state_tensor = torch.FloatTensor(state)
            with torch.no_grad():
                action = torch.argmax(dqn(state_tensor)).item()
        next_state, reward, terminated, truncated, _ = env.step(action)
        # Store the transition; only `terminated` should cut off bootstrapping (truncation should not)
        replay_buffer.append((state, action, reward, next_state, terminated))
        state = next_state
        total_reward += reward
        # Sample a mini-batch only once the buffer holds enough transitions
        if len(replay_buffer) < batch_size:
            continue
        batch = random.sample(replay_buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.FloatTensor(np.array(states))
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(np.array(next_states))
        dones = torch.FloatTensor(dones)
        # Current Q-values for the chosen actions, and bootstrapped targets from the target network
        q_values = dqn(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            next_q_values = target_dqn(next_states).max(1)[0]
        target_q_values = rewards + gamma * next_q_values * (1 - dones)
        # Gradient step on the mean-squared TD error
        loss = nn.MSELoss()(q_values, target_q_values)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Periodically sync the target network with the online network
    if episode % 100 == 0:
        target_dqn.load_state_dict(dqn.state_dict())
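To check what the network has learned, roll out the greedy policy with exploration and learning both disabled. A minimal evaluation sketch, assuming the `dqn` trained by the loop above:

```python
# Run a few greedy episodes with the trained network (no exploration, no updates)
eval_env = gym.make('CartPole-v1')
for _ in range(5):
    state = eval_env.reset()[0]
    terminated = truncated = False
    total_reward = 0
    while not (terminated or truncated):
        with torch.no_grad():
            action = torch.argmax(dqn(torch.FloatTensor(state))).item()
        state, reward, terminated, truncated, _ = eval_env.step(action)
        total_reward += reward
    print(f"Greedy evaluation reward: {total_reward}")
eval_env.close()
```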
The core value of Q-learning: a model-free method that learns state-action values directly through trial and error, simple enough to implement in a few dozen lines, and effective on small or easily discretized state spaces such as the three environments above.

The extensibility of DQN: replacing the Q-table with a neural network, combined with experience replay and a periodically synchronized target network, carries the same temporal-difference update over to high-dimensional and continuous state spaces.

Future directions: applying these techniques, together with careful reward design and richer simulators, to domains such as robot control and autonomous driving.

Taking reinforcement learning from theory to practice requires choosing an algorithm that matches the characteristics of the environment and improving performance through engineering techniques such as hyperparameter tuning and reward design. As computing resources (e.g., TPU clusters) and simulation platforms (e.g., CARLA) continue to develop, reinforcement learning will unlock even greater potential in areas such as autonomous driving and robotics.