By counting how often each word occurs in a text, we can surface its keywords and get a quick picture of its core content.
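At its core this is simple counting; a minimal, self-contained sketch of the idea before the full pipeline below:

from collections import Counter

# Count whitespace-separated tokens in a toy sentence
print(Counter("the cat sat on the mat".split()).most_common(2))
# -> [('the', 2), ('cat', 1)]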
The full pipeline relies on a few building blocks: TfidfVectorizer computes the TF-IDF scores, and the NLTK data (the punkt tokenizer and the stopwords lists) lives under C:/nltk_data. The preprocess_text function first strips punctuation using string.punctuation. For Chinese it segments the text with jieba and loads Chinese stop words from the external file chinese_stopwords.txt; for English it tokenizes with word_tokenize and loads NLTK's English stop word list:

def preprocess_text(text, language='english'):
    # Strip punctuation (string.punctuation covers ASCII punctuation only)
    text = text.translate(str.maketrans('', '', string.punctuation))
    if language == 'chinese':
        # Segment Chinese text with jieba
        words = jieba.cut(text)
        # Load the Chinese stop word list from an external file
        with open('chinese_stopwords.txt', encoding='utf-8') as f:
            stop_words = set(f.read().split())
    else:
        # English: lowercase, then tokenize
        text = text.lower()
        words = word_tokenize(text)
        stop_words = set(stopwords.words('english'))
    # Drop stop words, pure digits, and single-character tokens
    filtered_words = [word for word in words if
                      word not in stop_words
                      and not word.isdigit()
                      and len(word) > 1]
    return filtered_words
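To exercise the Chinese branch as well, a minimal sketch (it assumes chinese_stopwords.txt is present; the sample sentence is illustrative):

zh_text = "自然语言处理是人工智能的一个重要分支。"
print(preprocess_text(zh_text, language='chinese'))
# e.g. ['自然语言', '处理', '人工智能', '重要', '分支'] -- exact segmentation depends on jieba's dictionary.
# Note: string.punctuation misses full-width marks like "。", but the len(word) > 1 filter drops them anyway.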
After preprocess_text has cleaned the sample text, Counter tallies the term frequencies; the top entries are content words such as language and computers:

sample_text = """
Natural language processing (NLP) is a subfield of linguistics, computer science,
and artificial intelligence concerned with the interactions between computers
and human language. It focuses on how to program computers to process and
analyze large amounts of natural language data. Key tasks include text
classification, sentiment analysis, machine translation, and speech recognition.
"""
processed_words = preprocess_text(sample_text)
word_freq = Counter(processed_words)
# Print the 10 most frequent words
print("Top 10 most frequent words (TF):")
for word, freq in word_freq.most_common(10):
    print(f"{word}: {freq}")
TfidfVectorizer then turns a small document collection into a TF-IDF matrix; for the first document the highest-weighted terms are words such as computers and understand:

documents = [
"Natural language processing enables computers to understand human language.",
"Machine learning is a key component of artificial intelligence.",
"Text classification and sentiment analysis are common NLP tasks.",
"Deep learning has revolutionized speech recognition systems."
]
# Compute TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()
# Get the TF-IDF values of the first document
tfidf_values = tfidf_matrix[0].toarray().flatten()
tfidf_dict = {word: score for word, score in zip(feature_names, tfidf_values)}
print("\nTF-IDF示例(第一个文档):")
for word, score in sorted(tfidf_dict.items(), key=lambda x: x[1], reverse=True)[:5]:
print(f"{word}: {score:.4f}")
# ====================
# Visualization (word cloud)
# ====================
wordcloud = WordCloud(width=800, height=400,
background_color='white',
colormap='viridis').generate_from_frequencies(word_freq)
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Frequency Cloud')
plt.show()
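If the figure should also land on disk, the wordcloud package's to_file method can save it directly, e.g. wordcloud.to_file('word_cloud.png') (the filename is illustrative).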
# ====================
# Visualization (bar chart)
# ====================
top_words = 10
words, frequencies = zip(*word_freq.most_common(top_words))
plt.figure(figsize=(12, 6))
plt.barh(range(len(words)), frequencies, color='skyblue')
plt.yticks(range(len(words)), words)
plt.gca().invert_yaxis()  # put the most frequent word at the top
plt.xlabel('Frequency')
plt.title(f'Top {top_words} Most Frequent Words')
plt.tight_layout()
plt.show()
A few caveats: the language parameter switches between Chinese and English processing (the walkthrough above only exercises the English path). chinese_stopwords.txt must exist in the current working directory and be UTF-8 encoded, or the Chinese branch will raise an error. And writing to C:/nltk_data may require administrator privileges.
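If the hard dependency on that file is unwanted, the stop word lookup can fail gracefully; a sketch (the helper name and the empty-set fallback are assumptions, not part of the script below):

from pathlib import Path

def load_stopwords(path='chinese_stopwords.txt'):
    # Return an empty stop word set when the file is missing, instead of raising
    p = Path(path)
    return set(p.read_text(encoding='utf-8').split()) if p.exists() else set()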
The complete script:

import nltk
nltk.data.path.append('C:/nltk_data')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import jieba  # Chinese word segmentation library
# Initialize NLTK data (downloaded on the first run)
nltk.download('punkt', download_dir='C:/nltk_data')
nltk.download('stopwords', download_dir='C:/nltk_data')
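# Note: newer NLTK releases (3.8.2+) may also require:
# nltk.download('punkt_tab', download_dir='C:/nltk_data')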
# ====================
# Text preprocessing function
# ====================
def preprocess_text(text, language='english'):
    # Strip punctuation (string.punctuation covers ASCII punctuation only)
    text = text.translate(str.maketrans('', '', string.punctuation))
    if language == 'chinese':
        # Segment Chinese text with jieba
        words = jieba.cut(text)
        # Load the Chinese stop word list from an external file
        with open('chinese_stopwords.txt', encoding='utf-8') as f:
            stop_words = set(f.read().split())
    else:
        # English: lowercase, then tokenize
        text = text.lower()
        words = word_tokenize(text)
        stop_words = set(stopwords.words('english'))
    # Drop stop words, pure digits, and single-character tokens
    filtered_words = [word for word in words if
                      word not in stop_words
                      and not word.isdigit()
                      and len(word) > 1]
    return filtered_words
# ====================
# Sample text
# ====================
sample_text = """
Natural language processing (NLP) is a subfield of linguistics, computer science,
and artificial intelligence concerned with the interactions between computers
and human language. It focuses on how to program computers to process and
analyze large amounts of natural language data. Key tasks include text
classification, sentiment analysis, machine translation, and speech recognition.
"""
# ====================
# Term frequency statistics (TF)
# ====================
processed_words = preprocess_text(sample_text)
word_freq = Counter(processed_words)
# Print the 10 most frequent words
print("Top 10 most frequent words (TF):")
for word, freq in word_freq.most_common(10):
print(f"{word}: {freq}")
# ====================
# TF-IDF computation (demonstrated on multiple documents)
# ====================
documents = [
"Natural language processing enables computers to understand human language.",
"Machine learning is a key component of artificial intelligence.",
"Text classification and sentiment analysis are common NLP tasks.",
"Deep learning has revolutionized speech recognition systems."
]
# Compute TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()
# Get the TF-IDF values of the first document
tfidf_values = tfidf_matrix[0].toarray().flatten()
tfidf_dict = {word: score for word, score in zip(feature_names, tfidf_values)}
print("\nTF-IDF示例(第一个文档):")
for word, score in sorted(tfidf_dict.items(), key=lambda x: x[1], reverse=True)[:5]:
print(f"{word}: {score:.4f}")
# ====================
# Visualization (word cloud)
# ====================
wordcloud = WordCloud(width=800, height=400,
background_color='white',
colormap='viridis').generate_from_frequencies(word_freq)
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Frequency Cloud')
plt.show()
# ====================
# Visualization (bar chart)
# ====================
top_words = 10
words, frequencies = zip(*word_freq.most_common(top_words))
plt.figure(figsize=(12, 6))
plt.barh(range(len(words)), frequencies, color='skyblue')
plt.yticks(range(len(words)), words)
plt.gca().invert_yaxis()  # put the most frequent word at the top
plt.xlabel('Frequency')
plt.title(f'Top {top_words} Most Frequent Words')
plt.tight_layout()
plt.show()
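The cloud above is driven by raw counts; it can equally be driven by TF-IDF weights, which de-emphasize ubiquitous words. A sketch reusing feature_names and tfidf_matrix from above (the weight dict is an assumption, not part of the original script):

# Map each vocabulary word to its TF-IDF weight in the first document
tfidf_weights = {word: score for word, score
                 in zip(feature_names, tfidf_matrix[0].toarray().flatten())
                 if score > 0}
tfidf_cloud = WordCloud(width=800, height=400,
                        background_color='white').generate_from_frequencies(tfidf_weights)
plt.figure(figsize=(12, 6))
plt.imshow(tfidf_cloud, interpolation='bilinear')
plt.axis('off')
plt.title('TF-IDF Weighted Cloud')
plt.show()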