Python NLTK: tokenizing text, removing stopwords, and normalizing words (stemming and lemmatization)

import nltk 
from nltk.corpus import stopwords         # stopwords
from nltk.tokenize import word_tokenize   # tokenization
from nltk.stem import PorterStemmer       # stemming
from nltk.stem import WordNetLemmatizer   # lemmatization
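# The NLTK data used below has to be downloaded once, e.g.:
# nltk.download('punkt')      # tokenizer models used by word_tokenize
# nltk.download('stopwords')  # stopword lists
# nltk.download('wordnet')    # lexicon used by WordNetLemmatizer
# nltk.download('omw-1.4')    # may also be required by newer NLTK versions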


stop_words = set(stopwords.words('english'))  # set of English stopwords
text = "Come on,you are the best!"  # sample text
word_tokens = word_tokenize(text)   # tokenize

#filtered_sentence = list(set(word_tokens)-stop_words)  # deduplicated set of words with stopwords removed
filtered_sentence = [w for w in word_tokens if w not in stop_words]
print(" ".join(word_tokens)) 
print(" ".join(filtered_sentence)) 

stem_words = []
ps = PorterStemmer()
for w in filtered_sentence:
    rootWord = ps.stem(w)
    stem_words.append(rootWord)
print(filtered_sentence)
print(stem_words)
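# Note: the Porter stemmer is a rule-based suffix stripper, so its output is
# lowercased and not always a dictionary word, e.g. (illustrative words only,
# not taken from the sample text above):
# ps.stem("studies") gives roughly 'studi', ps.stem("running") gives roughly 'run'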

lemma_word = []
wordnet_lemmatizer = WordNetLemmatizer()
for w in filtered_sentence:
    word1 = wordnet_lemmatizer.lemmatize(w, pos="n")
    word2 = wordnet_lemmatizer.lemmatize(word1, pos="v")
    word3 = wordnet_lemmatizer.lemmatize(word2, pos="a")
    # the pos argument is the part of speech (n = noun, v = verb, a = adjective)
    lemma_word.append(word3)
print(lemma_word)
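# A hedged alternative sketch (the helper name get_wordnet_pos is my own, not
# part of NLTK): instead of lemmatizing with n, v, and a in sequence, the pos
# argument can be derived from nltk.pos_tag
# (requires nltk.download('averaged_perceptron_tagger')).
from nltk import pos_tag
from nltk.corpus import wordnet

def get_wordnet_pos(tag):
    # map Penn Treebank tags to WordNet part-of-speech constants
    if tag.startswith("J"):
        return wordnet.ADJ
    if tag.startswith("V"):
        return wordnet.VERB
    if tag.startswith("R"):
        return wordnet.ADV
    return wordnet.NOUN

tagged = pos_tag(filtered_sentence)
print([wordnet_lemmatizer.lemmatize(w, get_wordnet_pos(t)) for w, t in tagged])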

Other Python libraries for text processing include spaCy, Gensim, and TextBlob.
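For rough comparison, a minimal spaCy sketch (assuming the en_core_web_sm model has been installed with python -m spacy download en_core_web_sm) covering the same steps, since spaCy tokenizes, lemmatizes, and flags stopwords in one pass:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Come on,you are the best!")
# keep non-stopword tokens together with their lemmas
print([(token.text, token.lemma_) for token in doc if not token.is_stop])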
