1. Limitations of Traditional Reinforcement Learning
Traditional reinforcement learning (RL) relies on a predefined reward function, but for complex tasks such as natural language generation or robot control, designing an accurate reward function is extremely difficult. For example:
| Approach | Problem | Typical Example |
|---|---|---|
| Supervised fine-tuning (SFT) | Cannot capture complex preferences; easily overfits the labeled data | GPT-2's inconsistency in dialogue settings |
| Rule-based systems | Struggle with open-domain inputs; high maintenance cost | Hand-crafted rule bases of early customer-service bots |
| Pure RL | Reward functions are hard to design; prone to reward hacking | Score farming by text-game AI agents |
2. Why Human Feedback Is Needed
Humans can intuitively judge whether an output meets expectations (e.g., truthfulness, safety, fluency), but turning that subjective judgment into a computable reward function is a major challenge. RLHF uses human feedback directly, bypassing the bottleneck of hand-designed reward functions and bringing models closer to human values.
3. Evolution of Related Techniques
| Approach | Core Idea | Limitation |
|---|---|---|
| Inverse reinforcement learning (IRL) | Infer the reward function from expert demonstrations | Requires complete expert trajectories |
| Preference learning | Learn an implicit reward from pairwise comparisons | Low information per comparison; slow convergence |
| Co-training | Human-in-the-loop interactive policy optimization | Real-time feedback is expensive |
4. RLHF's Milestone Breakthrough
RLHF proceeds in three stages; the core idea is to turn human preferences into an optimizable objective:
1. Supervised fine-tuning (SFT)
2. Reward modeling (RM)
Pairwise ranking: rankings are converted into preference probabilities (e.g., via the Bradley-Terry model), and the likelihood of the observed preference pairs is maximized; a minimal code sketch of this loss follows this list.
Example loss function:
$$\mathcal{L}(\theta) = -\mathbb{E}_{(x, y_w, y_l)\sim D}\left[\log \sigma\!\left(R_\theta(x, y_w) - R_\theta(x, y_l)\right)\right]$$
where $y_w$ is the preferred output, $y_l$ the less-preferred output, and $R_\theta$ the reward model.
3. Reinforcement learning optimization (RL Fine-Tuning)
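As a concrete illustration of the pairwise objective in stage 2, here is a minimal PyTorch sketch (not part of the training code below; bradley_terry_loss and the toy reward tensors are hypothetical names for this example):
import torch
import torch.nn.functional as F

def bradley_terry_loss(reward_chosen: torch.Tensor, reward_rejected: torch.Tensor) -> torch.Tensor:
    """Negative log-likelihood of preferring the chosen response: -log sigma(r_w - r_l)."""
    # logsigmoid is numerically more stable than log(sigmoid(...))
    return -F.logsigmoid(reward_chosen - reward_rejected).mean()

# toy usage: reward-model scores for 3 preference pairs
r_w = torch.tensor([1.2, 0.3, 2.0])
r_l = torch.tensor([0.5, 0.4, 1.1])
print(bradley_terry_loss(r_w, r_l))  # scalar loss; smaller when chosen rewards exceed rejected ones
Note that the RM training example later in this article uses a margin-based hinge loss in compute_loss instead; both variants push the chosen reward above the rejected one.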
Advantages of RLHF:
1. Aligning with complex objectives
2. Higher generation quality
3. Better data efficiency
4. Safety and controllability
Remaining challenges:
1. High cost of human annotation
2. Feedback bias
3. Limitations of the reward model
4. Training complexity and instability
5. Difficulty of evaluation
By folding human preferences into the reinforcement learning loop, RLHF markedly improves a model's ability to align with complex objectives, but its success depends on high-quality annotations, a stable training pipeline, and a solid understanding of the reward model. As the technique matures, RLHF should enable safe, controllable AI deployments in more domains.
RM Training
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
class PreferenceDataset(Dataset):
"""人类偏好数据集加载器"""
def __init__(self, data_path, tokenizer, max_length=128):
self.tokenizer = tokenizer
self.max_length = max_length
        # Example schema: each record contains prompt, chosen_response, rejected_response
self.data = self.load_data(data_path)
def load_data(self, path):
        # Replace with real data-loading logic in practice
return [
{
"prompt": "Explain quantum physics",
"chosen": "Quantum physics studies subatomic particles...",
"rejected": "It's something about tiny invisible things..."
},
            # more samples...
]
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
item = self.data[idx]
        # Encode the chosen and rejected responses separately
chosen = self.tokenizer(
item["prompt"] + self.tokenizer.sep_token + item["chosen"],
max_length=self.max_length,
padding="max_length",
truncation=True,
return_tensors="pt"
)
rejected = self.tokenizer(
item["prompt"] + self.tokenizer.sep_token + item["rejected"],
max_length=self.max_length,
padding="max_length",
truncation=True,
return_tensors="pt"
)
return {
"chosen_input_ids": chosen["input_ids"].squeeze(),
"chosen_attention_mask": chosen["attention_mask"].squeeze(),
"rejected_input_ids": rejected["input_ids"].squeeze(),
"rejected_attention_mask": rejected["attention_mask"].squeeze()
}
class RewardModelTrainer:
def __init__(self, model_name="roberta-base", lr=1e-5):
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Initialize the model
        config = AutoConfig.from_pretrained(model_name)
        config.num_labels = 1  # single scalar reward (regression-style head)
self.model = AutoModelForSequenceClassification.from_pretrained(
model_name,
config=config
).to(self.device)
        # Freeze the encoder parameters (optional); only the reward head is trained
        for param in self.model.roberta.parameters():  # adjust to the actual model structure
            param.requires_grad = False
        # Replace the final classification layer with a small reward head.
        # RobertaForSequenceClassification passes the full sequence output to self.classifier,
        # so the head must pool the <s> (CLS) token itself before scoring.
        class RewardHead(torch.nn.Module):
            def __init__(self, hidden_size):
                super().__init__()
                self.net = torch.nn.Sequential(
                    torch.nn.Linear(hidden_size, 256),
                    torch.nn.ReLU(),
                    torch.nn.Linear(256, 1)
                )
            def forward(self, features, **kwargs):
                return self.net(features[:, 0, :])  # score from the <s> token representation
        self.model.classifier = RewardHead(config.hidden_size).to(self.device)
self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
def compute_loss(self, chosen_rewards, rejected_rewards):
"""对比损失函数:确保chosen奖励 > rejected奖励"""
# 使用pairwise ranking loss
margin = 1.0 # 控制奖励差异的幅度
return torch.mean(
torch.clamp(rejected_rewards - chosen_rewards + margin, min=0)
)
def train_epoch(self, dataloader):
self.model.train()
total_loss = 0
for batch in dataloader:
            # Forward pass: score both responses with the reward model
            chosen_outputs = self.model(
                input_ids=batch["chosen_input_ids"].to(self.device),
                attention_mask=batch["chosen_attention_mask"].to(self.device)
            )
            chosen_rewards = chosen_outputs.logits.squeeze(-1)  # (batch,)
            rejected_outputs = self.model(
                input_ids=batch["rejected_input_ids"].to(self.device),
                attention_mask=batch["rejected_attention_mask"].to(self.device)
            )
            rejected_rewards = rejected_outputs.logits.squeeze(-1)  # (batch,)
            # Pairwise ranking loss
            loss = self.compute_loss(chosen_rewards, rejected_rewards)
            # Backward pass
self.optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
self.optimizer.step()
total_loss += loss.item()
return total_loss / len(dataloader)
def evaluate(self, dataloader):
"""计算验证集准确率"""
self.model.eval()
correct = 0
total = 0
with torch.no_grad():
for batch in dataloader:
                chosen_rewards = self.model(
                    input_ids=batch["chosen_input_ids"].to(self.device),
                    attention_mask=batch["chosen_attention_mask"].to(self.device)
                ).logits.squeeze(-1)
                rejected_rewards = self.model(
                    input_ids=batch["rejected_input_ids"].to(self.device),
                    attention_mask=batch["rejected_attention_mask"].to(self.device)
                ).logits.squeeze(-1)
                correct += (chosen_rewards > rejected_rewards).sum().item()
                total += chosen_rewards.numel()
return correct / total
# Example training pipeline
if __name__ == "__main__":
trainer = RewardModelTrainer(model_name="roberta-base", lr=1e-5)
    dataset = PreferenceDataset("data/", trainer.tokenizer)
    # Split into training and validation sets
    val_size = int(0.1 * len(dataset))
    train_set, val_set = torch.utils.data.random_split(dataset, [len(dataset) - val_size, val_size])
    train_loader = DataLoader(train_set, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=8)
for epoch in range(5):
train_loss = trainer.train_epoch(train_loader)
val_acc = trainer.evaluate(val_loader)
print(f"Epoch {epoch+1}:")
print(f" Train Loss: {train_loss:.4f}")
print(f" Val Accuracy: {val_acc:.2%}\n")
    # Save the final model
torch.save(trainer.model.state_dict(), "reward_model.pth")
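After training, the saved weights can be reloaded to score new prompt/response pairs, which is how the RM feeds the RL stage. The following is a hypothetical usage sketch; it assumes the RewardModelTrainer class and the reward_model.pth file produced above:
import torch

# rebuild the same architecture, then load the trained weights
scorer = RewardModelTrainer(model_name="roberta-base")
scorer.model.load_state_dict(torch.load("reward_model.pth", map_location=scorer.device))
scorer.model.eval()

def score(prompt: str, response: str) -> float:
    """Return the scalar reward the RM assigns to a prompt/response pair."""
    enc = scorer.tokenizer(
        prompt + scorer.tokenizer.sep_token + response,
        max_length=128, padding="max_length", truncation=True, return_tensors="pt"
    ).to(scorer.device)
    with torch.no_grad():
        return scorer.model(**enc).logits.squeeze(-1).item()

print(score("Explain quantum physics", "Quantum physics studies subatomic particles..."))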
Training a Large Model with RLHF
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import AutoModelForCausalLM, AutoTokenizer
class RLHFTrainer:
def __init__(self, config):
        # Initialize all components
        self.device = config.device
        # Policy model (the one being trained)
        self.policy = AutoModelForCausalLM.from_pretrained(config.policy_path).to(self.device)
        # Reference model (frozen)
        self.ref_model = AutoModelForCausalLM.from_pretrained(config.ref_path).to(self.device)
        for param in self.ref_model.parameters():
            param.requires_grad = False
        # Reward model (simplified stand-in for this example)
        self.reward_model = AutoModelForCausalLM.from_pretrained(config.reward_path).to(self.device)
        for param in self.reward_model.parameters():
            param.requires_grad = False
        # The optimizer updates only the policy model
        self.optimizer = AdamW(self.policy.parameters(), lr=config.lr)
        # Key hyperparameters
self.kl_coef = config.kl_coef
self.clip_epsilon = config.clip_epsilon
self.gamma = config.gamma
    def compute_reward(self, responses):
        """Compute reward scores (toy version; replace with a real trained RM in practice)"""
        # Use the reward model's hidden states as a crude scoring signal
        with torch.no_grad():
            outputs = self.reward_model(responses, output_hidden_states=True)
            last_hidden = outputs.hidden_states[-1]  # (batch, seq_len, hidden)
            # Pool over tokens and hidden units to get one scalar per sequence
            reward = last_hidden.mean(dim=(1, 2))    # (batch,)
        return reward
    def compute_kl(self, policy_logits, ref_logits):
        """KL divergence between the policy and the reference model (per token, averaged)"""
        policy_logp = torch.log_softmax(policy_logits, dim=-1)
        ref_logp = torch.log_softmax(ref_logits, dim=-1)
        kl_div = torch.sum(torch.exp(policy_logp) * (policy_logp - ref_logp), dim=-1)
        return torch.mean(kl_div)
def train_step(self, queries):
        # Stage 1: generate responses
policy_outputs = self.policy.generate(
queries, max_length=128, do_sample=True, top_k=50
)
with torch.no_grad():
ref_outputs = self.ref_model.generate(
queries, max_length=128, do_sample=True, top_k=50
)
        # Stage 2: score the responses
policy_rewards = self.compute_reward(policy_outputs)
ref_rewards = self.compute_reward(ref_outputs)
        # Normalize rewards (improves stability)
policy_rewards = (policy_rewards - policy_rewards.mean()) / (policy_rewards.std() + 1e-8)
ref_rewards = (ref_rewards - ref_rewards.mean()) / (ref_rewards.std() + 1e-8)
        # Advantage estimate (simplified: policy reward minus reference baseline)
        advantages = policy_rewards - ref_rewards
        # Stage 3: score the generated sequences under the policy and the reference model
        policy_logits = self.policy(policy_outputs).logits
        with torch.no_grad():
            ref_logits = self.ref_model(policy_outputs).logits
        # Sequence log-probabilities of the sampled tokens (shift: logits at t predict token t+1)
        def sequence_logprob(logits, tokens):
            logp = torch.log_softmax(logits[:, :-1, :], dim=-1)
            token_logp = logp.gather(-1, tokens[:, 1:].unsqueeze(-1)).squeeze(-1)
            return token_logp.sum(dim=-1)  # (batch,)
        policy_logp = sequence_logprob(policy_logits, policy_outputs)
        ref_logp = sequence_logprob(ref_logits, policy_outputs)
        # Stage 4: clipped (PPO-style) policy loss over whole sequences
        ratio = torch.exp(policy_logp - ref_logp.detach())
        clipped_ratio = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon)
        policy_loss = -torch.min(ratio * advantages, clipped_ratio * advantages).mean()
        # Stage 5: KL divergence penalty
kl_penalty = self.kl_coef * self.compute_kl(policy_logits, ref_logits)
        # Total loss
total_loss = policy_loss + kl_penalty
        # Stage 6: backward pass and update
self.optimizer.zero_grad()
total_loss.backward()
torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1.0)
self.optimizer.step()
return {
"total_loss": total_loss.item(),
"policy_loss": policy_loss.item(),
"kl_penalty": kl_penalty.item(),
"avg_reward": policy_rewards.mean().item()
}
# Example configuration
class Config:
device = "cuda" if torch.cuda.is_available() else "cpu"
policy_path = "gpt2"
ref_path = "gpt2"
reward_path = "gpt2"
lr = 1e-5
kl_coef = 0.1
clip_epsilon = 0.2
gamma = 1.0
# Example training loop
if __name__ == "__main__":
trainer = RLHFTrainer(Config())
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # GPT-2 has no pad token by default; reuse EOS and left-pad for decoder-only generation
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    # Toy training prompts
    queries = tokenizer(["Explain quantum physics", "Write a poem about AI"],
                        return_tensors="pt", padding=True).to(Config.device)
for step in range(100):
metrics = trainer.train_step(queries.input_ids)
print(f"Step {step}:")
print(f" Loss: {metrics['total_loss']:.4f}")
print(f" Reward: {metrics['avg_reward']:.4f}")
print(f" KL: {metrics['kl_penalty']:.4f}\n")
Key code walkthrough:
Model architecture
self.policy = ...        # trainable policy model
self.ref_model = ...     # frozen reference model
self.reward_model = ...  # model that supplies the reward signal
Advantage computation
advantages = policy_rewards - ref_rewards
Core policy loss
ratio = torch.exp(policy_logp - ref_logp.detach())
clipped_ratio = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon)
policy_loss = -torch.min(ratio * advantages, clipped_ratio * advantages).mean()
KL divergence penalty
kl_penalty = self.kl_coef * self.compute_kl(policy_logits, ref_logits)
Explicitly bounds how far each update moves the policy
Prevents excessive drift from the initial policy (the reference model)
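Putting these pieces together, the objective optimized in this stage is, in its standard form, expected reward minus a KL penalty toward the reference policy (the coefficient β plays the role of kl_coef in the code above):
$$\max_{\theta}\ \mathbb{E}_{x\sim D,\ y\sim \pi_\theta(\cdot\mid x)}\left[ R(x, y) \right] - \beta\, \mathrm{KL}\!\left(\pi_\theta(\cdot\mid x)\,\|\,\pi_{\mathrm{ref}}(\cdot\mid x)\right)$$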
Notes on the simplifications
Simplified advantage estimate: R_policy - R_ref is used directly in place of the Generalized Advantage Estimation (GAE) of standard PPO, because there is no critic: the value-function network is omitted entirely.
Dynamic reward normalization: policy_rewards = (policy_rewards - mean) / std keeps the reward scale stable across batches.