内容提要:以ChineseAnalyzer为例,简单讲讲lucene分析器,也就是analyzer的分析过程
一:分析器原理
语料——>tokenizer分词器分词——>过滤器过滤——>词元——>放进字典(记录词元和位置信息)
二:代码分析
1:一共有5个类,第一个是ChineseAnalyzer分析器类,还有ChineseFilter过滤器类和它的工厂类,和ChineseTokenizer类和它的工厂类
2:ChineseAnalyzer类
public final class ChineseAnalyzer extends Analyzer { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new ChineseTokenizer(reader);//new一个tokenizer return new TokenStreamComponents(source, new ChineseFilter(source));//把tokonizer和过滤器放入语汇流处理器组建中 } }
3:ChineseFilter类,它处理的是分词器已经切分好的词元(英文和数字由分词器按空格等非字母数字字符切开),主要处理停用词,并把长度为1的英文词元去掉
public final class ChineseFilter extends TokenFilter { // Only English now, Chinese to be added later.停用词,可以添加在这里 public static final String[] STOP_WORDS = { "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with" }; private CharArraySet stopTable; private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); public ChineseFilter(TokenStream in) { super(in); stopTable = new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS), false); } @Override public boolean incrementToken() throws IOException { while (input.incrementToken()) { char text[] = termAtt.buffer();//以空格为截断符截取出来的的字符数组 int termLength = termAtt.length(); //过滤器的主要功能,字符是先按照空格截取后的字符数组,先判断是不是在停用词里面,然后判断是不是英文字母,在判断是不是其他字符 // why not key off token type here assuming ChineseTokenizer comes first? if (!stopTable.contains(text, 0, termLength)) {//是不是在停用词里面 switch (Character.getType(text[0])) { case Character.LOWERCASE_LETTER://是不是引文字母 case Character.UPPERCASE_LETTER: // English word/token should larger than 1 character. if (termLength>1) {//要是英文字母,且长度大于1才回返回给语汇处理器 return true; } break; case Character.OTHER_LETTER://要是其他字符,直接返回 // One Chinese character as one Chinese word. // Chinese word extraction to be added later here. return true; } } } return false; } }4:ChineseTokenizer类,是处理分词的
/**
 * Tokenizer that emits each character of type {@link Character#OTHER_LETTER}
 * as its own token, and each run of digits and upper/lower-case letters as one
 * lower-cased token. Characters of any other type act as separators.
 */
public final class ChineseTokenizer extends Tokenizer {

  /** Longest run of letters/digits emitted as a single token. */
  private static final int MAX_WORD_LEN = 255;

  /** Number of chars requested from the input reader per read. */
  private static final int IO_BUFFER_SIZE = 1024;

  /** Position in the input, counted in chars consumed so far. */
  private int offset = 0;

  /** Index of the next unread char in {@link #ioBuffer}. */
  private int bufferIndex = 0;

  /** Number of valid chars in {@link #ioBuffer}; -1 once the reader is exhausted. */
  private int dataLen = 0;

  /** Accumulates the chars of the token currently being built. */
  private final char[] buffer = new char[MAX_WORD_LEN];

  /** Chunk most recently read from the input reader. */
  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

  /** Number of chars currently held in {@link #buffer}. */
  private int length;

  /** Input offset at which the token currently being built starts. */
  private int start;

  // Finished tokens are published through these two attributes.
  /** Receives the token text. */
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /** Receives the token's start and end offsets. */
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  public ChineseTokenizer(Reader in) {
    super(in);
  }

  public ChineseTokenizer(AttributeSource source, Reader in) {
    super(source, in);
  }

  public ChineseTokenizer(AttributeFactory factory, Reader in) {
    super(factory, in);
  }

  /**
   * Appends the lower-cased char to the pending token. When it is the first
   * char of the token, the token's start offset is recorded as well.
   */
  private void push(char c) {
    if (length == 0) {
      // offset was already advanced past c, so the token starts one char earlier.
      start = offset - 1;
    }
    buffer[length] = Character.toLowerCase(c);
    length++;
  }

  /**
   * Publishes the pending token, if there is one, to the term and offset attributes.
   *
   * @return {@code true} if a token was published, {@code false} if nothing was pending
   */
  private boolean flush() {
    if (length <= 0) {
      return false;
    }
    // System.out.println(new String(buffer, 0, length));
    termAtt.copyBuffer(buffer, 0, length);
    offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
    return true;
  }

  /**
   * Reads input until one token is complete or the input ends.
   *
   * @return {@code true} if a token was produced, {@code false} if the input
   *         ended with nothing pending
   * @throws IOException if reading from the input fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();

    // Start a fresh token at the position where the previous call stopped.
    length = 0;
    start = offset;

    for (;;) {
      offset++;

      // Refill the I/O buffer once every char in it has been consumed.
      if (bufferIndex >= dataLen) {
        dataLen = input.read(ioBuffer);
        bufferIndex = 0;
      }

      // End of input: undo the speculative advance and emit whatever is pending.
      if (dataLen == -1) {
        offset--;
        return flush();
      }

      final char c = ioBuffer[bufferIndex++];
      final int type = Character.getType(c);

      if (type == Character.DECIMAL_DIGIT_NUMBER
          || type == Character.LOWERCASE_LETTER
          || type == Character.UPPERCASE_LETTER) {
        // Digits and cased letters accumulate into one token, cut at MAX_WORD_LEN.
        push(c);
        if (length == MAX_WORD_LEN) {
          return flush();
        }
      } else if (type == Character.OTHER_LETTER) {
        if (length > 0) {
          // A letter/digit run is pending: un-read this char so the next call
          // sees it again, then emit the run on its own.
          bufferIndex--;
          offset--;
          return flush();
        }
        // Nothing pending: this single char is the whole token.
        push(c);
        return flush();
      } else if (length > 0) {
        // Any other char type ends a pending token and is itself discarded.
        return flush();
      }
    }
  }

  /**
   * Sets both offsets to the corrected final input position.
   */
  @Override
  public final void end() {
    // NOTE(review): some later Lucene 4.x releases expect end() to call super.end();
    // confirm against the Lucene version this is built with.
    final int finalOffset = correctOffset(offset);
    this.offsetAtt.setOffset(finalOffset, finalOffset);
  }

  /**
   * Resets the read position and buffer bookkeeping so the tokenizer can be reused.
   *
   * @throws IOException if the superclass reset fails
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    offset = 0;
    bufferIndex = 0;
    dataLen = 0;
  }
}