pip install torch transformers datasets pandas tqdm scikit-learn
BERT is like a "language expert" that has read the entire internet:
from datasets import load_dataset
# Load the SST-2 sentiment analysis dataset (Stanford Sentiment Treebank)
dataset = load_dataset('glue', 'sst2')
print(dataset['train'][0])  # inspect one example
# Example output:
# {'sentence': 'a stirring portrait of suffering', 'label': 1, 'idx': 0}
# label=1 means positive, 0 means negative
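Before touching the model, it is worth checking what load_dataset actually returned. A quick sketch:

# Inspect the splits: SST-2 ships with train/validation/test
for split, ds in dataset.items():
    print(split, len(ds))
# Note: the GLUE test split has hidden labels (all -1),
# so evaluation later uses the 'validation' split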
from transformers import BertTokenizer, BertForSequenceClassification
# Load BERT's tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the BERT classification model (2 classes)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    id2label={0: 'NEGATIVE', 1: 'POSITIVE'},  # readable labels for the inference pipeline later
    label2id={'NEGATIVE': 0, 'POSITIVE': 1},
    output_attentions=False  # skip attention weights when not needed, to save memory
)
# ⚠️ Note: the first run automatically downloads the pretrained weights (~400MB)
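As a quick sanity check, you can push one sentence through the freshly loaded model (a minimal sketch; the sentence is arbitrary). The classification head is randomly initialized at this point, so the scores are meaningless until fine-tuning:

import torch

# One untrained forward pass: expect a (1, 2) logits tensor
inputs = tokenizer("a stirring portrait of suffering", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.shape)  # torch.Size([1, 2]), one raw score per class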
def tokenize_function(examples):
    # BERT-specific tokenization. Return plain lists here;
    # set_format("torch") below handles the tensor conversion.
    return tokenizer(
        examples['sentence'],
        padding='max_length',  # pad every example to max_length
        truncation=True,       # cut off anything longer
        max_length=128,
    )
# Apply the tokenizer to every split
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Rename the label column to the name Trainer expects
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# Expose the columns as PyTorch tensors
tokenized_datasets.set_format("torch",
                              columns=["input_ids", "attention_mask", "labels"])
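A quick look at one processed example confirms the format Trainer will receive (a sketch; the exact mask values depend on the sentence length):

sample = tokenized_datasets["train"][0]
print(sample["input_ids"].shape)     # torch.Size([128]), padded/truncated to max_length
print(sample["attention_mask"][:8])  # 1 = real token, 0 = padding
print(sample["labels"])              # tensor(0) or tensor(1)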
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score
# Evaluation metric: plain accuracy
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(labels, preds)}
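If you also want F1 (more informative on less balanced datasets than SST-2), the same hook extends naturally. A sketch, with compute_metrics_with_f1 being a hypothetical name:

from sklearn.metrics import f1_score

def compute_metrics_with_f1(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds),  # binary F1, positive class = 1
    }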
# Training configuration
training_args = TrainingArguments(
    output_dir='./bert_results',      # output directory
    evaluation_strategy="epoch",      # evaluate after every epoch
    learning_rate=2e-5,               # small learning rate (the key to fine-tuning!)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,               # number of training epochs
    weight_decay=0.01,                # weight decay
    logging_dir='./logs',             # log directory
)
# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)
# Kick off training
trainer.train()

# Save the fine-tuned model
trainer.save_model("my_bert_sentiment")
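One easy-to-miss detail: save_model only writes the model weights and config. Saving the tokenizer into the same folder makes the directory fully self-contained, so the pipeline below could load it without being handed the tokenizer object:

# Also save the tokenizer so "my_bert_sentiment" loads standalone
tokenizer.save_pretrained("my_bert_sentiment")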
Project structure:

bert_sentiment_analysis/
├── train.py          # training script
├── predict.py        # prediction script
├── requirements.txt
└── README.md
requirements.txt contents:

torch>=2.0.0
transformers>=4.30.0
datasets>=2.12.0
scikit-learn>=1.0.0
tqdm>=4.0.0
Epoch    Training Loss    Validation Accuracy
1        0.324            0.894
2        0.198            0.906
3        0.142            0.912
from transformers import pipeline

# Build a sentiment-analysis pipeline from the saved model
classifier = pipeline(
    "text-classification",
    model="my_bert_sentiment",
    tokenizer=tokenizer
)

# Try a few examples
result = classifier("The movie was a waste of time")
print(result)  # [{'label': 'NEGATIVE', 'score': 0.98}]

result = classifier("This is the best film I've ever seen")
print(result)  # [{'label': 'POSITIVE', 'score': 0.96}]
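Pipelines also accept a list of inputs, which is handy for scoring many reviews at once (a small sketch; the example sentences are made up):

reviews = [
    "An unexpectedly moving story",
    "Two hours I will never get back",
]
for review, pred in zip(reviews, classifier(reviews)):
    print(f"{pred['label']:>8} ({pred['score']:.2f})  {review}")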
Optimization options (when GPU memory is tight):
- Stick with bert-base-uncased instead of a larger model
- Enable mixed-precision training (fp16=True)
- Reduce max_length (e.g. to 64)

Tuning suggestions (to push accuracy higher):
- Increase max_length (e.g. 256)
- Train for more epochs (num_train_epochs=5)

Solution (when the model is too slow or heavy to serve):
- Use a distilled version of bert-base-uncased (e.g. DistilBERT), as sketched below
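Swapping in DistilBERT only changes the checkpoint name; the rest of the recipe stays the same. A sketch (distilbert-base-uncased is a real Hugging Face checkpoint; expect roughly 40% fewer parameters at a small accuracy cost):

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Distilled BERT: same API, smaller and faster
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
    id2label={0: 'NEGATIVE', 1: 'POSITIVE'},
    label2id={'NEGATIVE': 0, 'POSITIVE': 1},
)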