package
com.ljq.analyzer;
import
java.io.StringReader;
import
jeasy.analysis.MMAnalyzer;
import
org.apache.lucene.analysis.Analyzer;
import
org.apache.lucene.analysis.SimpleAnalyzer;
import
org.apache.lucene.analysis.Token;
import
org.apache.lucene.analysis.TokenStream;
import
org.apache.lucene.analysis.cjk.CJKAnalyzer;
import
org.apache.lucene.analysis.standard.StandardAnalyzer;
import
org.junit.Test;
public
class
AnalyzerTest {
String ensaText
=
"
IndexWriter addDocument's a javadoc.txt
"
;
String ensa2Text
=
"
我们是中国人
"
;
String zhcjkaText
=
"
小笑话_总统的房间 Room .txt
"
;
String zhmnText
=
"
一位绅士到旅游胜地的一家饭店要开个房间
"
;
Analyzer ensa
=
new
StandardAnalyzer();
//
单字分词
Analyzer ensa2
=
new
SimpleAnalyzer();
Analyzer zhcjka
=
new
CJKAnalyzer();
//
二分法分词
Analyzer zhmn
=
new
MMAnalyzer();
//
词库分词
@Test
public
void
test()
throws
Exception {
//
单字分词
/*
(indexwriter,0,11,type=<ALPHANUM>)
(adddocument,12,25,type=<APOSTROPHE>)
(javadoc.txt,28,39,type=<HOST>)
*/
//
analyze(ensa, ensaText);
//
(我们是中国人,0,6)
//
analyze(ensa2, ensa2Text);
//
二分法分词
/*
(小笑,0,2,type=double)
(笑话,1,3,type=double)
(_,3,4,type=single)
(总统,4,6,type=double)
(统的,5,7,type=double)
(的房,6,8,type=double)
(房间,7,9,type=double)
(room,10,14,type=single)
(txt,16,19,type=single)
*/
//
analyze(zhcjka, zhcjkaText);
//
词库分词
/*
(一位,0,2)
(绅士,2,4)
(旅游胜地,5,9)
(一家,10,12)
(饭店,12,14)
(要,14,15)
(开个,15,17)
(房间,17,19)
*/
analyze(zhmn, zhmnText);
}
/**
* 分词
*
*
@param
analyzer
* 分词器
*
@param
text
* 数据源
*
@throws
Exception
*/
public
void
analyze(Analyzer analyzer, String text)
throws
Exception {
TokenStream tokenStream
=
analyzer.tokenStream(
"
content
"
,
new
StringReader(text));
for
(Token token
=
new
Token(); (token
=
tokenStream.next(token))
!=
null
;) {
System.out.println(token);
}
}
}