Training CartPole with the Deep Reinforcement Learning Algorithm PPO

PPO code for the discrete-action setting.

1.  Import the required packages

import torch
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim
from torch.distributions import Categorical
from collections import deque
import random
import matplotlib.pyplot as plt
import numpy as np
import gym
import copy
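
Note on versions: the environment code below assumes the classic Gym API, in which env.step() returns a 4-tuple (next_state, reward, done, info) while env.reset() already accepts a seed argument (roughly gym 0.22–0.25). On gymnasium or gym >= 0.26 the unpacking would need to change along these lines (a sketch only, not needed for the assumed versions):

# state, info = env.reset(seed=cfg.seed)
# next_state, reward, terminated, truncated, info = env.step(action)
# done = terminated or truncated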

2.  Define the actor network and the critic network

class actor_net(nn.Module):
    def __init__(self, state_n, action_n, hidden_n):
        super(actor_net, self).__init__()
        self.fc1 = nn.Linear(state_n, hidden_n)
        self.fc2 = nn.Linear(hidden_n, hidden_n)
        self.fc3 = nn.Linear(hidden_n, action_n)
    def forward(self, x):
        x = f.relu(self.fc1(x))
        x = f.relu(self.fc2(x))
        probs = f.softmax(self.fc3(x), dim = 1)
        return probs
class critic_net(nn.Module):
    def __init__(self, state_n, hidden_n):
        super(critic_net, self).__init__()
        self.fc1 = nn.Linear(state_n, hidden_n)
        self.fc2 = nn.Linear(hidden_n, hidden_n)
        self.fc3 = nn.Linear(hidden_n, 1)
    def forward(self, x):
        x = f.relu(self.fc1(x))
        x = f.relu(self.fc2(x))
        value = self.fc3(x)
        return value
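
As a quick sanity check (a minimal sketch, assuming CartPole's 4-dimensional state and 2 discrete actions), the two networks can be exercised with a dummy observation:

# Hypothetical smoke test, not part of the training code
net_a = actor_net(state_n=4, action_n=2, hidden_n=256)
net_c = critic_net(state_n=4, hidden_n=256)
dummy = torch.zeros(1, 4)      # a batch containing one CartPole observation
print(net_a(dummy).shape)      # torch.Size([1, 2]); each row sums to 1 (action probabilities)
print(net_c(dummy).shape)      # torch.Size([1, 1]); a single state-value estimate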

3.  Define the buffer that stores transitions, including pushing data, sampling data, and clearing data

class buffer(object):
    def __init__(self, length):
        self.buffer_length = length
        self.buffer = deque(maxlen = self.buffer_length)
    def push(self, trans):
        self.buffer.append(trans)
    def sample(self):
        batch = list(self.buffer)
        return zip(*batch)
    def clear(self):
        self.buffer.clear()
    def length(self):
        return len(self.buffer)
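
A minimal usage sketch (hypothetical values): each pushed transition is the tuple (state, action, log_prob, reward, done), and sample() transposes the stored transitions via zip(*batch), which is exactly how update() consumes the buffer.

# Hypothetical example
mem = buffer(length=100)
mem.push(([0.1, 0.0, 0.0, 0.0], 1, -0.69, 1.0, False))
mem.push(([0.2, 0.1, 0.0, 0.0], 0, -0.71, 1.0, True))
states, actions, log_probs, rewards, dones = mem.sample()
print(rewards)  # (1.0, 1.0)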

4.  Define the config

class config():
    def __init__(self):
        self.env_name = 'CartPole-v1'
        self.train_eps = 200
        self.test_eps = 20
        self.max_step = 200
        self.eval_eps = 5
        self.eval_per_ep = 10
        self.gamma = 0.99
        self.actor_lr = 0.0003
        self.critic_lr = 0.0003
        self.buffer_length = 100
        self.eps_clip = 0.2
        self.entropy_coef = 0.01
        self.batch_size = 100
        self.update_n = 4
        self.hidden_n = 256
        self.seed = 1
        self.device = 'cpu'

5.  Define PPO, including sampling an action from a state and the PPO update
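
For reference, the clipped surrogate objective that update() implements (here with normalized Monte Carlo returns standing in for the advantage estimate $\hat{A}_t$) is

$$L^{CLIP}(\theta) = \mathbb{E}_t\left[\min\left(r_t(\theta)\,\hat{A}_t,\ \mathrm{clip}\big(r_t(\theta),\,1-\epsilon,\,1+\epsilon\big)\,\hat{A}_t\right)\right], \qquad r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\mathrm{old}}}(a_t \mid s_t)}$$

where $\epsilon$ is eps_clip in the config and an entropy bonus weighted by entropy_coef is added to the objective to encourage exploration.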

class PPO():
    def __init__(self, cfg):
        self.gamma = cfg.gamma
        self.device = torch.device(cfg.device)
        self.actor = actor_net(cfg.state_n, cfg.action_n, cfg.hidden_n).to(self.device)
        self.critic = critic_net(cfg.state_n, cfg.hidden_n).to(self.device)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=cfg.actor_lr)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=cfg.critic_lr)
        self.memory = buffer(cfg.buffer_length)
        self.update_n = cfg.update_n
        self.eps_clip = cfg.eps_clip
        self.entropy_coef = cfg.entropy_coef
        self.sample_count = 0

        self.batch_size = cfg.batch_size

    def sample_action(self, state):
        self.sample_count += 1
        state = torch.tensor(state, device=self.device).unsqueeze(dim=0)
        probs = self.actor(state)
        dist = Categorical(probs)
        action = dist.sample()
        self.log_probs = dist.log_prob(action).detach()
        return action.detach().cpu().numpy().item()

    @torch.no_grad()
    def predict_action(self, state):
        state = torch.tensor(state, device=self.device).unsqueeze(dim=0)
        probs = self.actor(state)
        dist = Categorical(probs)
        action = dist.sample()
        return action.detach().cpu().numpy().item()

    def update(self):
        if self.memory.length() < self.batch_size:
            return
        states, actions, log_probs, rewards, dones = self.memory.sample()
        states = torch.tensor(np.array(states), device=self.device)
        actions = torch.tensor(np.array(actions), device=self.device)
        log_probs = torch.cat(log_probs).to(self.device)  # the stored old log-probs are 1-element tensors; concatenate into shape (batch,)
        returns = []
        discounted_sum = 0
        
        # Monte Carlo estimates of the advantage have high variance, while temporal-difference estimates have high bias;
        # in practice GAE (a combination of Monte Carlo and TD) is usually used (a standalone GAE sketch is given after this class).
        # Following the EasyRL ("Mushroom") book, Monte Carlo returns are used here.
        
        for reward, done in zip(reversed(rewards), reversed(dones)):
            if done:
                discounted_sum = 0
            discounted_sum = reward + (self.gamma * discounted_sum)
            returns.insert(0, discounted_sum)
        returns = torch.tensor(returns, device=self.device)
        returns = (returns - returns.mean()) / (returns.std() + 1e-5)

        for _ in range(self.update_n):
            values = self.critic(states).squeeze(-1)  # squeeze to shape (batch,) so it matches returns instead of broadcasting to (batch, batch)
            advantage = returns - values.detach()
            probs = self.actor(states)
            dist = Categorical(probs)
            new_log_probs = dist.log_prob(actions)
            ratio = torch.exp(new_log_probs - log_probs)
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * advantage
            actor_loss = -torch.min(surr1, surr2).mean() - self.entropy_coef * dist.entropy().mean()  # entropy bonus is subtracted so that minimizing the loss encourages exploration
            critic_loss = (returns - values).pow(2).mean()

            self.actor_optim.zero_grad()
            self.critic_optim.zero_grad()
            actor_loss.backward()
            critic_loss.backward()
            self.actor_optim.step()
            self.critic_optim.step()
        self.memory.clear()
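
The comment in update() mentions GAE. For reference only (a minimal sketch that is not wired into the training loop; gae_lambda, the values list, and next_value are assumptions), a GAE-style advantage computation looks like this:

def compute_gae(rewards, dones, values, next_value, gamma=0.99, gae_lambda=0.95):
    # Generalized Advantage Estimation over one rollout.
    # rewards, dones: per-step lists; values: list of V(s_t); next_value: V(s_T) used for bootstrapping.
    advantages = []
    gae = 0.0
    for t in reversed(range(len(rewards))):
        mask = 1.0 - float(dones[t])                                # stop bootstrapping at episode ends
        delta = rewards[t] + gamma * next_value * mask - values[t]  # one-step TD error
        gae = delta + gamma * gae_lambda * mask * gae
        advantages.insert(0, gae)
        next_value = values[t]
    return advantages  # returns-to-go would be advantages[t] + values[t]

Compared with the Monte Carlo returns used above, GAE trades a small amount of bias for a large reduction in variance, controlled by gae_lambda.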

6.  Define the environment and the agent

def get_env_agent(cfg):
    env = gym.make(cfg.env_name)
    state_n = env.observation_space.shape[0]
    action_n = env.action_space.n
    print('state space dimension:', state_n)
    print('number of actions:', action_n)
    setattr(cfg, 'state_n', state_n)
    setattr(cfg, 'action_n', action_n)
    agent = PPO(cfg)
    return env, agent

7.  Define training

def train(cfg, env, agent):
    print('train')
    rewards = []
    steps = []
    best_ep_reward = 0
    output_agent = None
    for ep_i in range(cfg.train_eps):
        ep_reward = 0
        ep_step = 0
        state = env.reset(seed = cfg.seed)
        for _ in range(cfg.max_step):
            ep_step += 1
            action = agent.sample_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.memory.push((state, action, agent.log_probs, reward, done))
            state = next_state
            agent.update()
            ep_reward += reward
            if done:
                break
        if (ep_i + 1) % cfg.eval_per_ep == 0:
            sum_eval_reward = 0
            for _ in range(cfg.eval_eps):
                eval_ep_reward = 0
                state = env.reset()
                for _ in range(cfg.max_step):
                    action = agent.predict_action(state)
                    next_state, reward, done, _ = env.step(action)
                    state = next_state
                    eval_ep_reward += reward
                    if done:
                        break
                sum_eval_reward += eval_ep_reward
            mean_eval_reward = sum_eval_reward / cfg.eval_eps
            if mean_eval_reward > best_ep_reward:
                best_ep_reward = mean_eval_reward
                output_agent = copy.deepcopy(agent)
                print('train ep_i:%d/%d, rewards:%f, mean_eval_reward:%f, best_ep_reward:%f, update model'%(ep_i + 1, cfg.train_eps, ep_reward, mean_eval_reward, best_ep_reward))
            else:
                print('train ep_i:%d/%d, rewards:%f, mean_eval_reward:%f, best_ep_reward:%f'%(ep_i + 1, cfg.train_eps, ep_reward, mean_eval_reward, best_ep_reward))
        steps.append(ep_step)
        rewards.append(ep_reward)
    env.close()
    return output_agent, rewards

8.  Define testing

def test(cfg, env, agent):
    print('test')
    rewards = []
    steps = []
    for ep_i in range(cfg.test_eps):
        ep_reward = 0
        ep_step = 0
        state = env.reset()
        for _ in range(cfg.max_step):
            ep_step += 1
            action = agent.predict_action(state)
            next_state, reward, done, _ = env.step(action)
            state = next_state
            ep_reward += reward
            if done:
                break
        steps.append(ep_step)
        rewards.append(ep_reward)
        print('test ep_i:%d, reward:%f'%(ep_i + 1, ep_reward))
    env.close()
    return rewards

9.  Define plotting

def smooth(data, weight = 0.9):
    last = data[0]
    smoothed = []
    for point in data:
        smoothed_val = last * weight + (1 - weight) * point
        smoothed.append(smoothed_val)
        last = smoothed_val
    return smoothed
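
smooth() is an exponential moving average: each output point is weight * previous_smoothed + (1 - weight) * current, which is why the smoothed curve lags behind and flattens the noise in the reward plot. A tiny hypothetical check:

print(smooth([0, 10, 10, 10]))  # approximately [0.0, 1.0, 1.9, 2.71]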

10.  The main function and run results

if __name__ == '__main__':
    cfg = config()
    env, agent = get_env_agent(cfg)
    better_agent, train_rewards = train(cfg, env, agent)
    plt.figure()
    plt.title('training rewards')
    plt.plot(train_rewards, label = 'train_rewards')
    plt.plot(smooth(train_rewards), label = 'train_smooth_rewards')
    plt.legend()

    test_rewards = test(cfg, env, better_agent)
    plt.figure()
    plt.title('testing rewards')
    plt.plot(test_rewards, label = 'test_rewards')
    plt.plot(smooth(test_rewards), label = 'test_smooth_rewards')
    plt.legend()
    plt.show()

[Figure 1: training reward curve]

[Figure 2: testing reward curve]
