用BoW和Bigram特征对电影评论进行情感分类(正/负面),并比较效果。
pip install numpy scikit-learn nltk
使用简单的自定义数据集(实际项目可用IMDB数据集):
# Toy corpus: every entry pairs a review with its sentiment label
# (0 = negative, 1 = positive), so text and label cannot drift apart.
labeled_reviews = [
    ("I hate this movie", 0),
    ("This film is terrible", 0),
    ("I love this wonderful film", 1),
    ("What a great movie", 1),
]
texts = [review for review, _ in labeled_reviews]
labels = [sentiment for _, sentiment in labeled_reviews]
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-Words: learn the unigram vocabulary from the corpus first, then
# turn every document into a row of per-word counts.
bow_vectorizer = CountVectorizer().fit(texts)
bow_features = bow_vectorizer.transform(texts)

bow_vocabulary = bow_vectorizer.get_feature_names_out()
bow_matrix = bow_features.toarray()  # dense copy, only for printing
print("BoW特征词表:", bow_vocabulary)
print("BoW特征矩阵:\n", bow_matrix)
输出:
BoW特征词表: ['film' 'great' 'hate' 'is' 'love' 'movie' 'terrible' 'this' 'what' 'wonderful']
BoW特征矩阵:
[[0 0 1 0 0 1 0 1 0 0]
[1 0 0 1 0 0 1 1 0 0]
[1 0 0 0 1 0 0 1 0 1]
[0 1 0 0 0 1 0 0 1 0]]
from sklearn.feature_extraction.text import CountVectorizer
# Bigram features: ngram_range=(2, 2) keeps only pairs of adjacent tokens
# (no single words), so each column counts one two-word sequence.
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2)).fit(texts)
bigram_features = bigram_vectorizer.transform(texts)

bigram_vocabulary = bigram_vectorizer.get_feature_names_out()
bigram_matrix = bigram_features.toarray()  # dense copy, only for printing
print("Bigram特征词表:", bigram_vocabulary)
print("Bigram特征矩阵:\n", bigram_matrix)
输出:
Bigram特征词表: ['film is' 'great movie' 'hate this' 'is terrible' 'love this' 'this film'
'this movie' 'this wonderful' 'what great' 'wonderful film']
Bigram特征矩阵:
[[0 0 1 0 0 0 1 0 0 0]
[1 0 0 1 0 1 0 0 0 0]
[0 0 0 0 1 0 0 1 0 1]
[0 1 0 0 0 0 0 0 1 0]]
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
# Demo only: the corpus is too small to hold anything out, so the very same
# matrices and labels are used for both training and evaluation. A real
# experiment needs a proper train/test split.
X_train_bow = X_test_bow = bow_features
X_train_bigram = X_test_bigram = bigram_features
y_train = y_test = labels

# Fit a multinomial Naive Bayes classifier on the BoW counts and report its score.
model_bow = MultinomialNB().fit(X_train_bow, y_train)
bow_accuracy = model_bow.score(X_test_bow, y_test)
print("BoW模型准确率:", bow_accuracy)

# Same procedure on the bigram counts.
model_bigram = MultinomialNB().fit(X_train_bigram, y_train)
bigram_accuracy = model_bigram.score(X_test_bigram, y_test)
print("Bigram模型准确率:", bigram_accuracy)
输出:
BoW模型准确率: 1.0
Bigram模型准确率: 1.0
# Larger toy corpus: every entry pairs a review with its sentiment label
# (0 = negative, 1 = positive). Keeping both in one tuple guarantees that
# `texts` and `labels` stay aligned and have the same length.
labeled_reviews = [
    ("I hate this movie", 0),
    ("This film is terrible", 0),
    ("I love this wonderful film", 1),
    ("What a great movie", 1),
    ("I dislike this film", 0),
    ("This movie is amazing", 1),
    ("I enjoy this film", 1),
    ("This film is awful", 0),
    ("I adore this movie", 1),
    ("This film is fantastic", 1),
    ("I loathe this movie", 0),
    ("This movie is boring", 0),
    ("I appreciate this film", 1),
    ("This film is dreadful", 0),
    ("I cherish this movie", 1),
    ("This film is mediocre", 0),
    ("I detest this movie", 0),
    ("This film is superb", 1),
    ("I value this film", 1),
    ("This movie is subpar", 0),
    ("I respect this film", 1),
    ("This film is excellent", 1),
    ("I abhor this movie", 0),
    ("This film is lackluster", 0),
    ("I admire this film", 1),
    ("This movie is unsatisfactory", 0),
    ("I relish this film", 1),
    ("This film is remarkable", 1),
    ("I scorn this movie", 0),
    ("This film is outstanding", 1),
    ("I disapprove of this film", 0),
    ("This movie is unremarkable", 0),
    ("I treasure this film", 1),
    ("This film is commendable", 1),
    ("I find this movie distasteful", 0),
    ("This film is praiseworthy", 1),
    ("I think this movie is substandard", 0),
    ("This film is noteworthy", 1),
    ("I consider this movie to be poor", 0),
    ("This film is exceptional", 1),
    ("I feel this movie is inadequate", 0),
    ("This film is extraordinary", 1),
    ("I regard this movie as unsatisfactory", 0),
    ("This film is phenomenal", 1),
    ("I perceive this movie as disappointing", 0),
    ("This film is stellar", 1),
    ("I think this movie is mediocre", 0),
]
texts = [review for review, _ in labeled_reviews]
labels = [sentiment for _, sentiment in labeled_reviews]

# Sanity check: both counts must match.
print("文本数据:", len(texts), "条")
print("label:", len(labels), "条")
# 导入所需库
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-Words: learn the unigram vocabulary from the corpus first, then
# turn every document into a row of per-word counts.
bow_vectorizer = CountVectorizer().fit(texts)
bow_features = bow_vectorizer.transform(texts)
bow_vocabulary = bow_vectorizer.get_feature_names_out()
print("BoW特征词表:", bow_vocabulary)
print("BoW特征矩阵:\n", bow_features.toarray())

# Bigram features: ngram_range=(2, 2) keeps only pairs of adjacent tokens,
# so each column counts one two-word sequence.
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2)).fit(texts)
bigram_features = bigram_vectorizer.transform(texts)
bigram_vocabulary = bigram_vectorizer.get_feature_names_out()
print("Bigram特征词表:", bigram_vocabulary)
print("Bigram特征矩阵:\n", bigram_features.toarray())
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation.
# The imported sklearn helper is called directly; the name `train_test_split`
# must not be rebound to a float, which would make the function unusable for
# the rest of the script. Splitting all three containers in a single call
# keeps their rows aligned. `stratify` keeps the class balance similar in
# both subsets, and the fixed `random_state` makes the reported accuracies
# reproducible.
# NOTE: both vectorizers were fitted on the full corpus, so the vocabulary
# already contains words from the held-out reviews.
(
    X_train_bow, X_test_bow,
    X_train_bigram, X_test_bigram,
    y_train, y_test,
) = train_test_split(
    bow_features,
    bigram_features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
train_len = len(y_train)  # number of training samples

# Train the BoW model and report its accuracy on the held-out samples.
model_bow = MultinomialNB()
model_bow.fit(X_train_bow, y_train)
print("BoW模型准确率:", model_bow.score(X_test_bow, y_test))

# Train the Bigram model and report its accuracy on the same held-out samples.
model_bigram = MultinomialNB()
model_bigram.fit(X_train_bigram, y_train)
print("Bigram模型准确率:", model_bigram.score(X_test_bigram, y_test))
"hate"
表示负面,"love"
表示正面)。"terrible this"
可能加强负面判断)。