打印语汇单元组成
代码:
package org.apache.lucene.demo;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Analyzer.ReuseStrategy;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter.MockRetainAttribute;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.Version;
/**
 * Command-line demo that runs several Lucene analyzers over sample text and
 * prints the tokens each analyzer produces.
 *
 * <p>When command-line arguments are supplied, each argument is analyzed
 * instead of the built-in samples.
 */
public class AnalysisTxt {
    /** Built-in sample sentences used when no arguments are given. */
    private static final String[] examples = {
        "The quick brown fox jumped over the lazy dog",
        // The address must be a real e-mail literal: the sample output reports it
        // at offsets 19 ~ 34, i.e. a 15-character token.
        "XY&Z Corporation - xyz@example.com"
    };

    /** Analyzers applied, in this order, to every piece of text. */
    private static final Analyzer[] analyzers = new Analyzer[] {
        new WhitespaceAnalyzer(),
        new SimpleAnalyzer(),
        new StopAnalyzer(Version.LUCENE_4_10_3),
        new StandardAnalyzer(Version.LUCENE_4_10_3)
    };

    /**
     * Entry point.
     *
     * @param args texts to analyze; the built-in samples are used when empty
     */
    public static void main(String[] args) {
        String[] strings = examples;
        if (args.length > 0) {
            strings = args;
        }
        for (String text : strings) {
            analyze(text);
        }
    }

    /**
     * Prints a header for {@code text}, then one section per analyzer listing
     * the tokens that analyzer emitted for it.
     *
     * @param text the text to run through every analyzer
     */
    private static void analyze(String text) {
        System.out.println("Analyzing \"" + text + "\"");
        for (Analyzer analyzer : analyzers) {
            String name = analyzer.getClass().getSimpleName();
            System.out.println(" " + name + ":");
            System.out.print(" ");
            try {
                AnalyzerUtils.displayTokens(analyzer, text);
            } catch (IOException e) {
                // Report the failure for this analyzer and keep going so the
                // remaining analyzers are still demonstrated.
                e.printStackTrace();
            }
            System.out.println("");
        }
        System.out.println("");
    }
}
/**
 * Helpers for printing the tokens produced by an analyzer or token stream.
 */
class AnalyzerUtils {
    /**
     * Tokenizes {@code text} with {@code analyzer} and prints every token to
     * standard output.
     *
     * <p>Drives the stream through reset, consumption, end and close; the
     * stream is closed even when consumption fails.
     *
     * @param analyzer analyzer used to build the token stream
     * @param text     text to tokenize
     * @throws IOException if the token stream fails while being consumed
     */
    public static void displayTokens(Analyzer analyzer, String text) throws IOException {
        String fieldName = "contents";
        Reader reader = new StringReader(text);
        TokenStream stream = analyzer.tokenStream(fieldName, reader);
        try {
            stream.reset();
            displayTokens(stream);
            // Signal end-of-stream before closing, as the consumer workflow requires.
            stream.end();
        } finally {
            stream.close();
        }
    }

    /**
     * Prints each remaining token of {@code stream} as
     * {@code [term, positionIncrement, startOffset ~ endOffset] }.
     *
     * <p>This method only calls {@code incrementToken()}; it does not call
     * {@code reset()}, {@code end()} or {@code close()} on the stream.
     *
     * @param stream token stream to consume
     * @throws IOException if advancing the stream fails
     */
    public static void displayTokens(TokenStream stream) throws IOException {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        while (stream.incrementToken()) {
            System.out.print("[" + term.toString()
                    + ", " + posIncrAtt.getPositionIncrement()
                    + ", " + offsetAtt.startOffset()
                    + " ~ " + offsetAtt.endOffset()
                    + "] ");
        }
    }
}
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Analyzer.ReuseStrategy;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter.MockRetainAttribute;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.Version;
/**
 * Command-line demo that runs several Lucene analyzers over sample text and
 * prints the tokens each analyzer produces.
 *
 * <p>When command-line arguments are supplied, each argument is analyzed
 * instead of the built-in samples.
 */
public class AnalysisTxt {
    /** Built-in sample sentences used when no arguments are given. */
    private static final String[] examples = {
        "The quick brown fox jumped over the lazy dog",
        // The address must be a real e-mail literal: the sample output reports it
        // at offsets 19 ~ 34, i.e. a 15-character token.
        "XY&Z Corporation - xyz@example.com"
    };

    /** Analyzers applied, in this order, to every piece of text. */
    private static final Analyzer[] analyzers = new Analyzer[] {
        new WhitespaceAnalyzer(),
        new SimpleAnalyzer(),
        new StopAnalyzer(Version.LUCENE_4_10_3),
        new StandardAnalyzer(Version.LUCENE_4_10_3)
    };

    /**
     * Entry point.
     *
     * @param args texts to analyze; the built-in samples are used when empty
     */
    public static void main(String[] args) {
        String[] strings = examples;
        if (args.length > 0) {
            strings = args;
        }
        for (String text : strings) {
            analyze(text);
        }
    }

    /**
     * Prints a header for {@code text}, then one section per analyzer listing
     * the tokens that analyzer emitted for it.
     *
     * @param text the text to run through every analyzer
     */
    private static void analyze(String text) {
        System.out.println("Analyzing \"" + text + "\"");
        for (Analyzer analyzer : analyzers) {
            String name = analyzer.getClass().getSimpleName();
            System.out.println(" " + name + ":");
            System.out.print(" ");
            try {
                AnalyzerUtils.displayTokens(analyzer, text);
            } catch (IOException e) {
                // Report the failure for this analyzer and keep going so the
                // remaining analyzers are still demonstrated.
                e.printStackTrace();
            }
            System.out.println("");
        }
        System.out.println("");
    }
}
/**
 * Helpers for printing the tokens produced by an analyzer or token stream.
 */
class AnalyzerUtils {
    /**
     * Tokenizes {@code text} with {@code analyzer} and prints every token to
     * standard output.
     *
     * <p>Drives the stream through reset, consumption, end and close; the
     * stream is closed even when consumption fails.
     *
     * @param analyzer analyzer used to build the token stream
     * @param text     text to tokenize
     * @throws IOException if the token stream fails while being consumed
     */
    public static void displayTokens(Analyzer analyzer, String text) throws IOException {
        String fieldName = "contents";
        Reader reader = new StringReader(text);
        TokenStream stream = analyzer.tokenStream(fieldName, reader);
        try {
            stream.reset();
            displayTokens(stream);
            // Signal end-of-stream before closing, as the consumer workflow requires.
            stream.end();
        } finally {
            stream.close();
        }
    }

    /**
     * Prints each remaining token of {@code stream} as
     * {@code [term, positionIncrement, startOffset ~ endOffset] }.
     *
     * <p>This method only calls {@code incrementToken()}; it does not call
     * {@code reset()}, {@code end()} or {@code close()} on the stream.
     *
     * @param stream token stream to consume
     * @throws IOException if advancing the stream fails
     */
    public static void displayTokens(TokenStream stream) throws IOException {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        while (stream.incrementToken()) {
            System.out.print("[" + term.toString()
                    + ", " + posIncrAtt.getPositionIncrement()
                    + ", " + offsetAtt.startOffset()
                    + " ~ " + offsetAtt.endOffset()
                    + "] ");
        }
    }
}
输出:
Analyzing "The quick brown fox jumped over the lazy dog"
WhitespaceAnalyzer:
[ The, 1, 0 ~ 3 ] [ quick, 1, 4 ~ 9 ] [ brown, 1, 10 ~ 15 ] [ fox, 1, 16 ~ 19 ] [ jumped, 1, 20 ~ 26 ] [ over, 1, 27 ~ 31 ] [ the, 1, 32 ~ 35 ] [ lazy, 1, 36 ~ 40 ] [ dog, 1, 41 ~ 44 ]
SimpleAnalyzer:
[ the, 1, 0 ~ 3 ] [ quick, 1, 4 ~ 9 ] [ brown, 1, 10 ~ 15 ] [ fox, 1, 16 ~ 19 ] [ jumped, 1, 20 ~ 26 ] [ over, 1, 27 ~ 31 ] [ the, 1, 32 ~ 35 ] [ lazy, 1, 36 ~ 40 ] [ dog, 1, 41 ~ 44 ]
StopAnalyzer:
[ quick, 2, 4 ~ 9 ] [ brown, 1, 10 ~ 15 ] [ fox, 1, 16 ~ 19 ] [ jumped, 1, 20 ~ 26 ] [ over, 1, 27 ~ 31 ] [ lazy, 2, 36 ~ 40 ] [ dog, 1, 41 ~ 44 ]
StandardAnalyzer:
[ quick, 2, 4 ~ 9 ] [ brown, 1, 10 ~ 15 ] [ fox, 1, 16 ~ 19 ] [ jumped, 1, 20 ~ 26 ] [ over, 1, 27 ~ 31 ] [ lazy, 2, 36 ~ 40 ] [ dog, 1, 41 ~ 44 ]
Analyzing "XY&Z Corporation - xyz@example.com"
WhitespaceAnalyzer:
[ XY&Z, 1, 0 ~ 4 ] [ Corporation, 1, 5 ~ 16 ] [ -, 1, 17 ~ 18 ] [ xyz@example.com, 1, 19 ~ 34 ]
SimpleAnalyzer:
[ xy, 1, 0 ~ 2 ] [ z, 1, 3 ~ 4 ] [ corporation, 1, 5 ~ 16 ] [ xyz, 1, 19 ~ 22 ] [ example, 1, 23 ~ 30 ] [ com, 1, 31 ~ 34 ]
StopAnalyzer:
[ xy, 1, 0 ~ 2 ] [ z, 1, 3 ~ 4 ] [ corporation, 1, 5 ~ 16 ] [ xyz, 1, 19 ~ 22 ] [ example, 1, 23 ~ 30 ] [ com, 1, 31 ~ 34 ]
StandardAnalyzer:
[ xy, 1, 0 ~ 2 ] [ z, 1, 3 ~ 4 ] [ corporation, 1, 5 ~ 16 ] [ xyz, 1, 19 ~ 22 ] [ example.com, 1, 23 ~ 34 ]
WhitespaceAnalyzer:
[ The, 1, 0 ~ 3 ] [ quick, 1, 4 ~ 9 ] [ brown, 1, 10 ~ 15 ] [ fox, 1, 16 ~ 19 ] [ jumped, 1, 20 ~ 26 ] [ over, 1, 27 ~ 31 ] [ the, 1, 32 ~ 35 ] [ lazy, 1, 36 ~ 40 ] [ dog, 1, 41 ~ 44 ]
SimpleAnalyzer:
[ the, 1, 0 ~ 3 ] [ quick, 1, 4 ~ 9 ] [ brown, 1, 10 ~ 15 ] [ fox, 1, 16 ~ 19 ] [ jumped, 1, 20 ~ 26 ] [ over, 1, 27 ~ 31 ] [ the, 1, 32 ~ 35 ] [ lazy, 1, 36 ~ 40 ] [ dog, 1, 41 ~ 44 ]
StopAnalyzer:
[ quick, 2, 4 ~ 9 ] [ brown, 1, 10 ~ 15 ] [ fox, 1, 16 ~ 19 ] [ jumped, 1, 20 ~ 26 ] [ over, 1, 27 ~ 31 ] [ lazy, 2, 36 ~ 40 ] [ dog, 1, 41 ~ 44 ]
StandardAnalyzer:
[ quick, 2, 4 ~ 9 ] [ brown, 1, 10 ~ 15 ] [ fox, 1, 16 ~ 19 ] [ jumped, 1, 20 ~ 26 ] [ over, 1, 27 ~ 31 ] [ lazy, 2, 36 ~ 40 ] [ dog, 1, 41 ~ 44 ]
Analyzing "XY&Z Corporation - xyz@example.com"
WhitespaceAnalyzer:
[ XY&Z, 1, 0 ~ 4 ] [ Corporation, 1, 5 ~ 16 ] [ -, 1, 17 ~ 18 ] [ xyz@example.com, 1, 19 ~ 34 ]
SimpleAnalyzer:
[ xy, 1, 0 ~ 2 ] [ z, 1, 3 ~ 4 ] [ corporation, 1, 5 ~ 16 ] [ xyz, 1, 19 ~ 22 ] [ example, 1, 23 ~ 30 ] [ com, 1, 31 ~ 34 ]
StopAnalyzer:
[ xy, 1, 0 ~ 2 ] [ z, 1, 3 ~ 4 ] [ corporation, 1, 5 ~ 16 ] [ xyz, 1, 19 ~ 22 ] [ example, 1, 23 ~ 30 ] [ com, 1, 31 ~ 34 ]
StandardAnalyzer:
[ xy, 1, 0 ~ 2 ] [ z, 1, 3 ~ 4 ] [ corporation, 1, 5 ~ 16 ] [ xyz, 1, 19 ~ 22 ] [ example.com, 1, 23 ~ 34 ]