With the rapid development of artificial intelligence, reinforcement learning (RL), an important branch of machine learning, is becoming a core tool for solving complex decision-making problems. OpenAI Gym, the most widely used reinforcement learning environment library, provides a diverse experimental platform ranging from classic control tasks to complex game scenarios. Using OpenAI Gym as the vehicle, this article systematically walks through the implementation of core reinforcement learning algorithms (including DQN, PPO, and SAC) and demonstrates, with hands-on code, the complete process by which an agent goes from scratch to mastering classic tasks such as CartPole, MountainCar, and LunarLander. By comparing how the algorithms differ in convergence speed, sample efficiency, and generalization, it highlights the key challenges and optimization directions for putting reinforcement learning into practice, offering developers end-to-end guidance from theory to implementation.
Reinforcement learning trains an agent to learn an optimal policy by interacting with an environment; its core elements are the state, the action, the reward, and the policy. As a standardized experimentation platform, OpenAI Gym ships with 200+ environments spanning domains from classic control (CartPole, MountainCar) and Box2D physics (LunarLander) to Atari games and robotics simulation.
This article uses Python 3.10 + Gym 1.0.0 as its technology stack (the code follows the Gym 0.26+ API, in which `env.reset()` returns `(observation, info)` and `env.step()` returns five values) and proceeds in three stages: the algorithmic fundamentals, three hands-on tasks of increasing difficulty (DQN on CartPole, PPO on MountainCar, SAC on LunarLander), and a cross-algorithm comparison.
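Before turning to specific algorithms, here is a minimal sketch of the basic Gym interaction loop that every agent below builds on, assuming the Gym 0.26+ API (`reset` returns `(observation, info)`, `step` returns five values); the agent is replaced by a random action purely for illustration.

```python
import gym

# Minimal interaction loop with a random policy (no learning involved).
env = gym.make('CartPole-v1')
state, info = env.reset()
total_reward = 0
while True:
    action = env.action_space.sample()  # placeholder for a learned policy
    state, reward, terminated, truncated, info = env.step(action)
    total_reward += reward
    if terminated or truncated:  # episode ends when the pole falls or the time limit is hit
        break
print(f"Random policy total reward: {total_reward}")
env.close()
```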
The Markov decision process (MDP) underlying reinforcement learning can be written as the search for a policy that maximizes the expected discounted return:
$$
\pi^*(s) = \arg\max_{\pi}\ \mathbb{E}\left[\sum_{t=0}^{\infty} \gamma^t\, r(s_t, a_t)\right]
$$
where $\gamma \in [0, 1)$ is the discount factor, $r(s_t, a_t)$ is the reward received for taking action $a_t$ in state $s_t$, and $\pi^*$ is the optimal policy.
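To make the discounted objective concrete, the toy sketch below computes the discounted return of a short, made-up reward sequence using the same γ = 0.99 that the agents later in this article use.

```python
# Toy illustration of the discounted return  sum_t gamma^t * r_t  (reward values are made up).
gamma = 0.99
rewards = [1.0, 1.0, 1.0, 0.0, -1.0]

discounted_return = sum(gamma ** t * r for t, r in enumerate(rewards))
print(f"Discounted return: {discounted_return:.4f}")  # ≈ 2.0095
```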
The CartPole task requires keeping the pole balanced upright by pushing the cart left or right:
```python
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random

class DQN(nn.Module):
    """Fully connected Q-network: maps a state to one Q-value per action."""
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, action_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

class DQNAgent:
    def __init__(self, state_dim, action_dim):
        self.action_dim = action_dim
        self.q_net = DQN(state_dim, action_dim)
        self.target_q_net = DQN(state_dim, action_dim)
        self.target_q_net.load_state_dict(self.q_net.state_dict())
        self.optimizer = optim.Adam(self.q_net.parameters(), lr=0.001)
        self.memory = deque(maxlen=10000)  # replay buffer
        self.gamma = 0.99                  # discount factor
        self.epsilon = 1.0                 # initial exploration rate
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01

    def choose_action(self, state):
        # Epsilon-greedy action selection
        if random.random() < self.epsilon:
            return random.randint(0, self.action_dim - 1)
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            q_values = self.q_net(state)
        return torch.argmax(q_values).item()

    def learn(self, batch_size):
        if len(self.memory) < batch_size:
            return
        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        states = torch.FloatTensor(np.array(states))
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(np.array(next_states))
        dones = torch.FloatTensor(dones)

        # Current Q-values and TD targets from the target network (no gradient through the target)
        q_values = self.q_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            max_next_q_values = self.target_q_net(next_states).max(1)[0]
        target_q_values = rewards + self.gamma * max_next_q_values * (1 - dones)

        loss = nn.MSELoss()(q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Decay exploration and occasionally sync the target network (~1% of updates)
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
        if random.random() < 0.01:
            self.target_q_net.load_state_dict(self.q_net.state_dict())

def train_cartpole():
    env = gym.make('CartPole-v1')
    agent = DQNAgent(env.observation_space.shape[0], env.action_space.n)
    for episode in range(500):
        state, _ = env.reset()  # Gym 0.26+ API: reset returns (obs, info)
        total_reward = 0
        while True:
            action = agent.choose_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            agent.memory.append((state, action, reward, next_state, done))
            agent.learn(32)
            state = next_state
            total_reward += reward
            if done:
                break
        if episode % 50 == 0:
            print(f"Episode {episode}, Total Reward: {total_reward}, Epsilon: {agent.epsilon:.2f}")
    env.close()

if __name__ == "__main__":
    train_cartpole()
```
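To inspect what the agent has learned, the short sketch below (not part of the original training code) runs a few greedy evaluation episodes; it assumes `train_cartpole` is modified to return the trained `agent` and simply switches off ε-greedy exploration.

```python
def evaluate_cartpole(agent, episodes=5):
    # Greedy evaluation of a trained DQNAgent; render_mode="human" opens a window.
    env = gym.make('CartPole-v1', render_mode="human")
    agent.epsilon = 0.0  # disable epsilon-greedy exploration
    for episode in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        while True:
            action = agent.choose_action(state)
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            if terminated or truncated:
                break
        print(f"Eval episode {episode}, Total Reward: {total_reward}")
    env.close()
```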
The MountainCar task (here in its continuous-control variant, MountainCarContinuous-v0) requires driving the underpowered car to the flag on the hilltop by accelerating left and right to build momentum:
```python
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.distributions import Normal

class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        # Policy head: outputs the action mean
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, action_dim),
            nn.Tanh()  # output range [-1, 1]
        )
        # Value head: estimates the state value V(s)
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, 1)
        )

    def forward(self, state):
        mu = self.actor(state)
        value = self.critic(state)
        return mu, value

class PPOAgent:
    def __init__(self, state_dim, action_dim):
        self.policy = ActorCritic(state_dim, action_dim)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=0.0003)
        self.gamma = 0.99
        self.clip_range = 0.2
        self.epochs = 10
        self.batch_size = 64
        self.action_std = 0.2  # fixed standard deviation of the Gaussian policy
        self.memory = []

    def choose_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            mu, _ = self.policy(state)
        # Sample for exploration, then clip to the environment's action range [-1, 1]
        dist = Normal(mu, self.action_std)
        action = dist.sample().squeeze(0).numpy()
        return np.clip(action, -1.0, 1.0)

    def store_transition(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def compute_returns(self, rewards, dones):
        # Discounted returns computed backwards over the episode, normalized for stability
        returns = []
        R = 0
        for r, done in zip(reversed(rewards), reversed(dones)):
            R = r + self.gamma * R * (1 - done)
            returns.insert(0, R)
        returns = torch.FloatTensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        return returns

    def learn(self):
        states, actions, rewards, next_states, dones = zip(*self.memory)
        states = torch.FloatTensor(np.array(states))
        actions = torch.FloatTensor(np.array(actions))
        returns = self.compute_returns(rewards, dones)

        # Log-probabilities under the old (pre-update) policy, held fixed across the epochs
        with torch.no_grad():
            old_mu, _ = self.policy(states)
            old_log_probs = Normal(old_mu, self.action_std).log_prob(actions).sum(dim=-1)

        for _ in range(self.epochs):
            indices = np.random.permutation(len(self.memory))
            for i in range(0, len(self.memory), self.batch_size):
                batch_indices = indices[i:i + self.batch_size]
                batch_states = states[batch_indices]
                batch_actions = actions[batch_indices]
                batch_returns = returns[batch_indices]
                batch_old_log_probs = old_log_probs[batch_indices]

                mu, values = self.policy(batch_states)
                dist = Normal(mu, self.action_std)
                log_probs = dist.log_prob(batch_actions).sum(dim=-1)

                # PPO clipped surrogate objective
                ratios = torch.exp(log_probs - batch_old_log_probs)
                advantages = (batch_returns - values.squeeze()).detach()
                surr1 = ratios * advantages
                surr2 = torch.clamp(ratios, 1 - self.clip_range, 1 + self.clip_range) * advantages
                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = nn.MSELoss()(values.squeeze(), batch_returns)
                loss = actor_loss + 0.5 * critic_loss

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
        self.memory = []

def train_mountaincar():
    env = gym.make('MountainCarContinuous-v0')
    agent = PPOAgent(env.observation_space.shape[0], env.action_space.shape[0])
    for episode in range(300):
        state, _ = env.reset()
        total_reward = 0
        while True:
            action = agent.choose_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            agent.store_transition(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            if done:
                break
        agent.learn()
        if episode % 50 == 0:
            print(f"Episode {episode}, Total Reward: {total_reward:.2f}")
    env.close()

if __name__ == "__main__":
    train_mountaincar()
```
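To see what the clipping in the surrogate objective actually does, the standalone sketch below evaluates the clipped objective for a few hand-picked probability ratios (illustrative values only) with the same `clip_range = 0.2` as above: ratios far above 1 stop contributing extra gain, which limits how far a single update can move the policy.

```python
import torch

# Hand-picked probability ratios and a positive advantage, purely for illustration.
clip_range = 0.2
ratios = torch.tensor([0.5, 0.9, 1.0, 1.1, 1.5])
advantage = 1.0

surr1 = ratios * advantage
surr2 = torch.clamp(ratios, 1 - clip_range, 1 + clip_range) * advantage
print(torch.min(surr1, surr2))  # tensor([0.5000, 0.9000, 1.0000, 1.1000, 1.2000])
```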
The LunarLander task requires firing the lander's main and side engines to touch down safely in the designated landing zone (the continuous-control variant is used here, so the action specifies the engines' thrust):
```python
class SACAgent:
    def __init__(self, state_dim, action_dim):
        # Initialize the policy network, the twin Q-networks, the target Q-networks,
        # the replay buffer, and the entropy temperature
        pass

    def choose_action(self, state, deterministic=False):
        # Sample a stochastic action (training) or return the deterministic mean (evaluation)
        pass

    def store_transition(self, state, action, reward, next_state, done):
        # Append the transition to the replay buffer
        pass

    def update(self, batch_size):
        # Update the Q-networks, the policy network, and the target networks
        pass

def train_lunarlander():
    env = gym.make('LunarLanderContinuous-v2')
    agent = SACAgent(env.observation_space.shape[0], env.action_space.shape[0])
    for episode in range(500):
        state, _ = env.reset()
        total_reward = 0
        while True:
            action = agent.choose_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            agent.store_transition(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            if done:
                break
        agent.update(batch_size=256)
        if episode % 50 == 0:
            print(f"Episode {episode}, Total Reward: {total_reward:.2f}")
    env.close()
```
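The skeleton above leaves SAC's internals open. As one possible way to fill in `choose_action`, the sketch below shows the squashed-Gaussian sampling SAC typically uses; `self.policy_net` (returning the action mean and log standard deviation) is a hypothetical attribute that would be created in `__init__`.

```python
import torch
from torch.distributions import Normal

def choose_action(self, state, deterministic=False):
    # Possible body for SACAgent.choose_action: squashed-Gaussian action selection.
    # self.policy_net is hypothetical here and assumed to return (mean, log_std).
    state = torch.FloatTensor(state).unsqueeze(0)
    mean, log_std = self.policy_net(state)
    if deterministic:
        action = torch.tanh(mean)            # evaluation: deterministic mean action
    else:
        dist = Normal(mean, log_std.exp())
        action = torch.tanh(dist.rsample())  # training: reparameterized sample
    return action.squeeze(0).detach().numpy()
```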
A side-by-side comparison of the three algorithms:

| Algorithm | Sample efficiency | Convergence speed | Stability | Best-suited scenarios |
|---|---|---|---|---|
| DQN | Low | Medium | Medium | Discrete action spaces |
| PPO | Medium | High | High | Continuous action spaces |
| SAC | High | Medium | High | High-dimensional continuous action spaces |
Debugging tips:
Performance optimization:
OpenAI Gym provides a standardized platform for validating reinforcement learning algorithms. Through three representative tasks, CartPole, MountainCar, and LunarLander, this article has walked through the algorithmic progression from DQN to PPO and SAC. The experiments point to the trade-offs summarized in the table above: value-based DQN is a solid fit for discrete control, while policy-optimization methods such as PPO and SAC handle continuous action spaces with greater stability.
As GPUs such as the A100/H100 become widely available and distributed training frameworks mature, the adoption of reinforcement learning in robot control, autonomous driving, game AI, and other fields will continue to accelerate. Developers should choose the algorithm that matches the characteristics of their task and use engineering techniques to improve training efficiency, ultimately carrying results from the laboratory into real-world deployments.