Notes from reading CLucene, the C++ port of the open-source search engine Lucene. These notes are fairly rough; the detailed analysis will be organized and posted step by step.
Input handling:
(1).streambase.h
template <class T>
class StreamBase {};
(2).inputstreambuffer.h
template <class T>
class InputStreamBuffer {};
(3).bufferedstream.h
template <class T>
class BufferedInputStream : public StreamBase<T> {};
(4).fileinputstream.h
class FileInputStream : public BufferedInputStream<char> {};
(5).Reader.h
class Reader;
class StringReader: public Reader {};
class SimpleInputStreamReader: public jstreams::BufferedInputStream<TCHAR>{};
class FileReader: public Reader{};
Tokenization:
(1). CharTokenizer is an abstract class for tokenizing Western-language text. In ordinary English, words are delimited by whitespace and punctuation, and tokenization splits on exactly
those delimiter characters.
(2). CharTokenizer has three concrete subclasses: LetterTokenizer, WhitespaceTokenizer, and LowerCaseTokenizer (which derives from LetterTokenizer);
the LetterTokenizer class ends the current token whenever it reads a non-letter character;
(3). TokenFilter is an abstract class defining filtering over a TokenStream that has already been through tokenization (a Tokenizer);
(4). Different Lucene analyzers (Analyzer) tokenize a TokenStream in different ways, and the right choice depends on the language. English, for instance, is generally split on whitespace,
which does not work for Chinese; the simplest approach there is to treat each individual character as a term.
A TokenStream is a stream constructed over a data source (a device or elsewhere); to tokenize, we operate on this stream. A TokenStream need not be built directly from a data source:
it can also be the token stream produced by a previous tokenization step, read into a TokenFilter.
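To make the LetterTokenizer policy concrete, here is a minimal self-contained sketch (illustrative only, not CLucene's actual code): a token is a maximal run of letters, and any non-letter character ends it.
#include <cctype>
#include <string>
#include <vector>
// Sketch of LetterTokenizer's policy: a token is a maximal run of
// letters; any non-letter character terminates the current token.
std::vector<std::string> letterTokenize(const std::string& text)
{
    std::vector<std::string> tokens;
    std::string current;
    for (unsigned char c : text) {
        if (std::isalpha(c)) {
            current.push_back((char)c);     // still inside a token
        } else if (!current.empty()) {
            tokens.push_back(current);      // non-letter ends the token
            current.clear();
        }
    }
    if (!current.empty())
        tokens.push_back(current);          // flush the trailing token
    return tokens;
}
// letterTokenize("Hello, world!") yields {"Hello", "world"}.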
Documents and fields:
(1).class Field;
(2).class DocumentFieldEnumeration;
(3).class DocumentFieldList;
(4).class Document;
Index-building support classes:
IndexInput.h IndexOutput.h Directory.h FSDirectory.h RAMDirectory.h
class Directory
(1). manages the lock factory and its lock instances;
(2). manages the basic properties of a Directory instance, chiefly through file names;
(3). manages the stream objects associated with operations on the directory;
(4). manages the copying of index files.
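The shape of that interface can be sketched as follows (simplified, hypothetical names; the real CLucene class has more methods and different signatures):
class LuceneLock;
class IndexInput;
class IndexOutput;
// Simplified sketch of the four responsibilities listed above.
class DirectorySketch
{
public:
    virtual ~DirectorySketch() {}
    virtual LuceneLock* makeLock(const char* name) = 0;      // (1) lock instances
    virtual bool fileExists(const char* name) const = 0;     // (2) managed by file name
    virtual void deleteFile(const char* name) = 0;           // (2)
    virtual IndexInput* openInput(const char* name) = 0;     // (3) stream objects
    virtual IndexOutput* createOutput(const char* name) = 0; // (3)
    // (4) index-file copying is built on top of openInput/createOutput
};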
class FSDirectory:public Directory
class IndexInput
class MMapIndexInput: public IndexInput
class BufferedIndexInput: public IndexInput
class FSIndexInput:public BufferedIndexInput
class IndexOutput
class BufferedIndexOutput : public IndexOutput
class FSIndexOutput: public BufferedIndexOutput
(1). acquires and manages the lock factory;
(2). manages the input and output streams over index files in a file-system directory;
(3). provides access to FSDirectory instances;
(4). once a lock factory instance has been obtained, a new FSDirectory instance can be created, after some initialization work is completed first;
(5). since it inherits from the abstract Directory class, it naturally implements copying of index files;
(6). FSDirectory defines many static inner classes, which makes them accessible only inside FSDirectory and hidden from external code.
class RAMFile
class RAMIndexOutput: public BufferedIndexOutput
class RAMIndexInput: public BufferedIndexInput
class RAMLock
Because RAMDirectory is a memory-resident directory, it is not persistent the way FSDirectory is; accordingly, a RAMDirectory can be instantiated from an existing FSDirectory instance.
RAMDirectory's nature means that complex operations on a Directory are all moved into memory for processing. A RAMDirectory can also be instantiated by copying a directory:
the specified directory dir is copied into the current process's memory. Here closeDir is an important flag: it specifies whether the source directory dir is closed once the copy completes.
If the source dir is closed as soon as the RAMDirectory is built, but the in-memory processing turns out to be very short, dir may have to be reopened to persist the results back to the
file-system directory, and that overhead can exceed operating on dir directly; this trade-off must be weighed.
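A usage sketch of this copy-based construction, assuming RAMDirectory has a constructor taking a source Directory* as in Java Lucene (treat that signature as an assumption and verify it against the CLucene headers):
// Copy an on-disk index into memory for fast, repeated access.
Directory* fsDir = FSDirectory::getDirectory("/tmp/index", false); // path is illustrative
RAMDirectory* ramDir = _CLNEW RAMDirectory(fsDir); // assumed copy constructor
// ... operate on ramDir entirely in memory ...
fsDir->close(); // whether to close the source is exactly the closeDir trade-off above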
void seek(int64_t pos) (this is BufferedIndexOutput::seek): flushes the buffer contents, then positions the file pointer at the location indicated by pos.
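The buffer must be drained before the pointer moves, because the buffered bytes belong at the old position. A self-contained sketch of the pattern (plain stdio, not CLucene's classes):
#include <cstdint>
#include <cstdio>
// Minimal model of the buffered-seek behavior described above.
class BufferedFileOutput
{
    FILE* fp;
    char buffer[1024];
    size_t used;
public:
    explicit BufferedFileOutput(FILE* f) : fp(f), used(0) {}
    void writeByte(char b) {
        if (used == sizeof(buffer)) flush();
        buffer[used++] = b;
    }
    void flush() {
        fwrite(buffer, 1, used, fp); // pending bytes land at the old position
        used = 0;
    }
    void seek(int64_t pos) {
        flush();                        // first: drain the buffer
        fseek(fp, (long)pos, SEEK_SET); // then: move the file pointer
    }
};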
Building the index:
IndexWriter.h IndexReader.h
class IndexWriter
class IndexReader
class DocumentWriter
class FieldInfos
class FieldsWriter
class FieldsReader
class Term
class TermInfo
class SegmentInfo
class SegmentReader
class SegmentTermPositions
class TermDocs
class TermPositions
class TermInfosReader
class TermInfosWriter
class SegmentInfos
class SegmentMerger
The indexing process:
(1). The constructor
IndexWriter::IndexWriter(const char* path, Analyzer* a, const bool create, const bool _closeDir):
directory( FSDirectory::getDirectory(path, create) ), analyzer(a), segmentInfos(_CLNEW SegmentInfos),
closeDir(_closeDir)
{
_IndexWriter ( create );
}
//create indicates if the indexWriter must create a new index located at path or just open it
void IndexWriter::_IndexWriter(const bool create)
{
//Create a TransactionalRAMDirectory object; it keeps a list of files to delete and a list of files to restore when a transaction is aborted.
//It also keeps the current file list. The restore list automatically deletes its keys and values; the delete list and the current file list do not.
ramDirectory = _CLNEW TransactionalRAMDirectory;
}
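The delete/restore bookkeeping described in those comments amounts to remembering enough, during a transaction, to undo every change. A standalone sketch of the idea (names and representation are illustrative, not CLucene's):
#include <map>
#include <set>
#include <string>
// Transactional file set: on abort, delete files created during the
// transaction and restore the saved contents of files that were overwritten.
class TransactionalFilesSketch
{
    std::map<std::string, std::string> files;           // current name -> contents
    std::set<std::string> createdInTrans;               // delete these on abort
    std::map<std::string, std::string> savedForRestore; // restore these on abort
    bool inTrans;
public:
    TransactionalFilesSketch() : inTrans(false) {}
    void transStart() { inTrans = true; }
    void write(const std::string& name, const std::string& data) {
        if (inTrans) {
            if (files.count(name))
                savedForRestore.emplace(name, files[name]); // keep the first old version
            else
                createdInTrans.insert(name);                // new file: remove on abort
        }
        files[name] = data;
    }
    void transAbort() {
        for (const std::string& n : createdInTrans) files.erase(n);
        for (const auto& kv : savedForRestore) files[kv.first] = kv.second;
        transCommit(); // reuse the cleanup
    }
    void transCommit() { createdInTrans.clear(); savedForRestore.clear(); inTrans = false; }
};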
(2).Document* doc = FileDocument( path ); ===>doc->add(*_CLNEW Field(_T("path"), tf, Field::STORE_YES | Field::INDEX_UNTOKENIZED ) );
doc->add(*_CLNEW Field(_T("contents"),str.getBuffer(), Field::STORE_YES | Field::INDEX_TOKENIZED) );
(3). writer->addDocument( doc ); ===> one writer adds many docs, so there will be many segment names
void IndexWriter::addDocument(Document* doc, Analyzer* analyzer)
{
ramDirectory->transStart();
char* segmentName = newSegmentName();
DocumentWriter* dw = _CLNEW DocumentWriter(ramDirectory, analyzer, this ); (a)
dw->addDocument(segmentName, doc); (b)
//Create a new SegmentInfo instance about this new segment.
SegmentInfo* si = _CLNEW SegmentInfo(segmentName, 1, ramDirectory);
segmentInfos->add(si);
maybeMergeSegments();
}
===>(a). The DocumentWriter constructor
DocumentWriter::DocumentWriter(CL_NS(store)::Directory* d, CL_NS(analysis)::Analyzer* a, IndexWriter* writer):analyzer(a),
directory(d),maxFieldLength(writer->getMaxFieldLength()),fieldInfos(NULL),fieldLengths(NULL),similarity(writer->getSimilarity()),
termIndexInterval( writer->getTermIndexInterval() ),fieldPositions(NULL),fieldBoosts(NULL),termBuffer(_CLNEW Term){}
===>(b). The DocumentWriter::addDocument method
void DocumentWriter::addDocument(const char* segment, Document* doc)
{
fieldInfos = _CLNEW FieldInfos();
fieldInfos->add(doc); (c)
fieldInfos->write(directory, buf); (d)
// write field values
FieldsWriter fieldsWriter(directory, segment, fieldInfos); (e)
fieldsWriter.addDocument(doc); (f)
clearPostingTable(); (g) // clear postingTable
size_t size = fieldInfos->size();
invertDocument(doc); //invert the document: the crucial step
Posting** postings = NULL;
int32_t postingsLength = 0;
//sort the terms in postingTable and return a sorted Posting[] array
sortPostingTable(postings,postingsLength); (h)
//write postings
//write the sorted Posting[] array into the segment's index files (the .frq and .prx files)
writePostings(postings,postingsLength, segment); (i)
//write the norms of the indexed Fields
writeNorms(segment); (j)
}
===>(c) The FieldInfos::add method
void FieldInfos::add(const Document* doc)
{
DocumentFieldEnumeration* fields = doc->fields();
Field* field;
while (fields->hasMoreElements()) {
field = fields->nextElement();
add(field->name(), field->isIndexed(), field->isTermVectorStored()); (c.1)
}
_CLDELETE(fields);
}
====>(d) The fieldInfos->write method
void FieldInfos::write(Directory* d, const char* name) const{
IndexOutput* output = d->createOutput(name);
try {
write(output); (d.1)
} _CLFINALLY (
output->close();
_CLDELETE(output);
);
}
====>(d.1) FieldInfos::write writes out the per-field information
void FieldInfos::write(IndexOutput* output) const{
output->writeVInt(size());
FieldInfo* fi;
uint8_t bits;
for (int32_t i = 0; i < size(); ++i) {
fi = fieldInfo(i);
bits = 0x0;
if (fi->isIndexed) bits |= IS_INDEXED; //bitwise OR: a bit of the result is 1 if the corresponding bit is 1 in either operand
if (fi->storeTermVector) bits |= STORE_TERMVECTOR;
if (fi->storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR;
if (fi->storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR;
if (fi->omitNorms) bits |= OMIT_NORMS;
output->writeString(fi->name,_tcslen(fi->name));
output->writeByte(bits); //write a single byte
}
}
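writeVInt above uses Lucene's variable-length integer format: seven payload bits per byte, low-order group first, with the high bit of each byte signaling that another byte follows. A self-contained sketch of the encoder:
#include <cstdint>
#include <vector>
// Lucene VInt encoding: 7 bits per byte, least-significant group first;
// a set high bit means "more bytes follow".
void writeVIntSketch(std::vector<uint8_t>& out, uint32_t value)
{
    while (value > 0x7F) {
        out.push_back((uint8_t)((value & 0x7F) | 0x80)); // continuation bit set
        value >>= 7;
    }
    out.push_back((uint8_t)value); // final byte, high bit clear
}
// Example: 300 (binary 1 0010 1100) encodes as the two bytes 0xAC 0x02.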
====>(e) The FieldsWriter fieldsWriter(directory, segment, fieldInfos) constructor
FieldsWriter::FieldsWriter(Directory* d, const char* segment, FieldInfos* fn):fieldInfos(fn)
{
const char* buf = Misc::segmentname(segment,".fdt");
fieldsStream = d->createOutput (buf);
_CLDELETE_CaARRAY( buf );
buf = Misc::segmentname(segment,".fdx");
indexStream = d->createOutput( buf );
_CLDELETE_CaARRAY( buf );
}
====>(f) fieldsWriter.addDocument(doc) writes the field values
void FieldsWriter::addDocument(Document* doc)
{
printf("fieldsStream->getFilePointer()=%lld\n", (long long)fieldsStream->getFilePointer());
indexStream->writeLong(fieldsStream->getFilePointer());
int32_t storedCount = 0;
DocumentFieldEnumeration* fields = doc->fields();
while (fields->hasMoreElements()) {
Field* field = fields->nextElement();
if (field->isStored())
storedCount++;
}
_CLDELETE(fields);
fieldsStream->writeVInt(storedCount);
fields = doc->fields();
while (fields->hasMoreElements()) {
Field* field = fields->nextElement();
if (field->isStored())
{
fieldsStream->writeVInt(fieldInfos->fieldNumber(field->name())); //write the field's number
uint8_t bits = 0;
if (field->isTokenized())
bits |= FieldsWriter::FIELD_IS_TOKENIZED;
if (field->isBinary())
bits |= FieldsWriter::FIELD_IS_BINARY;
if (field->isCompressed())
bits |= FieldsWriter::FIELD_IS_COMPRESSED;
fieldsStream->writeByte(bits);
if ( field->isCompressed() ){
_CLTHROWA(CL_ERR_Runtime, "CLucene does not directly support compressed fields. Write a compressed byte array instead");
}else{
// compression is disabled for the current field
if (field->isBinary()) {
jstreams::StreamBase<char>* stream = field->streamValue();
const char* sd;
int32_t rl = stream->read(sd,10000000,0);
if ( rl < 0 ){
fieldsStream->writeVInt(0); //todo: could we detect this earlier and not actually write the field??
}else{
fieldsStream->writeVInt(rl);
fieldsStream->writeBytes((uint8_t*)sd, rl);
}
}else if ( field->stringValue() == NULL ){ //we must be using readerValue
CND_PRECONDITION(!field->isIndexed(), "Cannot store reader if it is indexed too")
Reader* r = field->readerValue();
const TCHAR* rv;
int64_t rl = r->read(rv, LUCENE_INT32_MAX_SHOULDBE);
if ( rl > LUCENE_INT32_MAX_SHOULDBE )
_CLTHROWA(CL_ERR_Runtime,"Field length too long");
else if ( rl < 0 )
rl = 0;
fieldsStream->writeString( rv, (int32_t)rl);
}else if ( field->stringValue() != NULL ){
fieldsStream->writeString(field->stringValue(),_tcslen(field->stringValue()));
}else
_CLTHROWA(CL_ERR_Runtime, "No values are set for the field");
}
}
}
_CLDELETE(fields);
}
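Note the division of labor above: for each document, indexStream (.fdx) records the byte offset at which that document's stored fields begin in fieldsStream (.fdt). Because each .fdx entry is a fixed-width 8-byte long, locating document n costs a single seek; a sketch of the arithmetic (FieldsReader performs the real lookup):
#include <cstdint>
// Each .fdx entry is one 8-byte long, so document n's entry sits at n * 8.
int64_t fdxEntryOffset(int32_t docNum)
{
    return (int64_t)docNum * 8;
}
// To read document n: seek .fdx to fdxEntryOffset(n), readLong() the .fdt
// position, seek .fdt there, then readVInt() the stored-field count.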
====>(g) clearPostingTable
void DocumentWriter::clearPostingTable()
{
PostingTableType::iterator itr = postingTable.begin();
while ( itr != postingTable.end() ){
_CLDELETE(itr->second);
_CLLDECDELETE(itr->first);
++itr;
}
postingTable.clear();
}
====>(h) The sortPostingTable(postings, postingsLength) method
void DocumentWriter::sortPostingTable(Posting**& array, int32_t& arraySize)
{
arraySize = postingTable.size();
array = _CL_NEWARRAY(Posting*,arraySize);
PostingTableType::iterator postings = postingTable.begin();
int32_t i=0;
while ( postings != postingTable.end() )
{
array[i] = (Posting*)postings->second;
postings++;
i++;
}
quickSort(array, 0, i - 1);
}
====> The DocumentWriter::writePostings method
void DocumentWriter::writePostings(Posting** postings, const int32_t postingsLength, const char* segment)
{
}
typedef CL_NS(util)::CLHashtable<Term*,Posting*,Term::Compare, Term::Equals> PostingTableType;
PostingTableType postingTable;
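invertDocument, flagged above as the crucial step, conceptually fills this table: for every token position in the document it looks up the term and appends that position to the term's Posting. A simplified standalone model (CLucene keys a custom hashtable on Term*; std::map keeps this sketch self-contained):
#include <cstdint>
#include <map>
#include <string>
#include <vector>
// term -> list of positions at which the term occurred in the document;
// the term frequency is positions.size().
struct PostingSketch
{
    std::vector<int32_t> positions;
};
void addPosition(std::map<std::string, PostingSketch>& postingTable,
                 const std::string& term, int32_t position)
{
    postingTable[term].positions.push_back(position); // first hit creates the entry
}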
Segment merging:
class SegmentInfo;
class SegmentInfos;
SegmentInfo* si = _CLNEW SegmentInfo(segmentName, 1, ramDirectory);
segmentInfos->add(si);
maybeMergeSegments();
=====> IndexWriter::maybeMergeSegments, the method that decides when segments are merged
void IndexWriter::maybeMergeSegments()
{
int64_t targetMergeDocs = minMergeDocs;
// find segments smaller than current target size
while (targetMergeDocs <= maxMergeDocs) {
int32_t minSegment = segmentInfos->size();
int32_t mergeDocs = 0;
while (--minSegment >= 0) {
SegmentInfo* si = segmentInfos->info(minSegment);
if (si->docCount >= targetMergeDocs)
break;
mergeDocs += si->docCount;
}
if (mergeDocs >= targetMergeDocs){
// found a merge to do
mergeSegments(minSegment+1); (1)
}else
break;
//increase target size
targetMergeDocs *= mergeFactor;
}
}
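The loop above implements the classic logarithmic merge policy: whenever the tail run of segments smaller than the current target jointly reaches the target doc count, that run is merged into one segment, and the target is multiplied by mergeFactor to look for coarser merges. A standalone simulation of the trigger (doc counts only, not CLucene code):
#include <cstdint>
#include <vector>
// Simulate maybeMergeSegments over a list of per-segment doc counts.
void maybeMergeSketch(std::vector<int64_t>& segs, int64_t minMergeDocs,
                      int64_t maxMergeDocs, int64_t mergeFactor)
{
    int64_t target = minMergeDocs;
    while (target <= maxMergeDocs) {
        int64_t mergeDocs = 0;
        int32_t idx = (int32_t)segs.size();
        while (--idx >= 0 && segs[idx] < target)
            mergeDocs += segs[idx]; // accumulate the tail run of small segments
        if (mergeDocs >= target) {
            segs.erase(segs.begin() + (idx + 1), segs.end());
            segs.push_back(mergeDocs); // the run becomes one merged segment
        } else {
            break; // nothing mergeable at this or any larger target
        }
        target *= mergeFactor; // look for a coarser merge next
    }
}
// With ten 1-doc segments, minMergeDocs=10 and mergeFactor=10, the ten
// segments collapse into a single 10-doc segment.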
=====>(1).IndexWriter::mergeSegments
void IndexWriter::mergeSegments(const uint32_t minSegment, const uint32_t end)
{
CLVector<SegmentReader*> segmentsToDelete(false);
const char* mergedName = newSegmentName();
#ifdef _CL_DEBUG_INFO
fprintf(_CL_DEBUG_INFO, "merging segments\n");
#endif
SegmentMerger merger(this, mergedName);
for (size_t i = minSegment; i < end; i++) {
SegmentInfo* si = segmentInfos->info(i);
SegmentReader* reader = _CLNEW SegmentReader(si); (1)
merger.add(reader);
if ((reader->getDirectory() == this->directory) || // if we own the directory
(reader->getDirectory() == this->ramDirectory)){
segmentsToDelete.push_back(reader); // queue segment for deletion
}
}
int32_t mergedDocCount = merger.merge(); (2)
segmentInfos->clearto(minSegment);// remove old infos & add new
segmentInfos->add( _CLNEW SegmentInfo(mergedName, mergedDocCount, directory) );
// close readers before we attempt to delete now-obsolete segments
merger.closeReaders();
}
=====>(1).SegmentReader* reader = _CLNEW SegmentReader(si)
Inside the constructor, it calls:
SegmentReader::initialize(SegmentInfo* si)
{
SegmentName(buf, CL_MAX_PATH, ".fnm");
fieldInfos = _CLNEW FieldInfos(cfsDir, buf ); ====>calls the FieldInfos::read(IndexInput* input) method
SegmentName(buf,CL_MAX_PATH, ".frq");
freqStream = cfsDir->openInput( buf );
SegmentName(buf, CL_MAX_PATH,".prx");
proxStream = cfsDir->openInput( buf );
//Instantiate a FieldsReader for reading the Field Info File
fieldsReader = _CLNEW FieldsReader(cfsDir, segment, fieldInfos);
tis = _CLNEW TermInfosReader(cfsDir, segment, fieldInfos);
}
=====>(2)
int32_t SegmentMerger::merge()
{
int32_t value = mergeFields(); (2.1)
mergeTerms(); (2.2)
mergeNorms(); (2.3)
if (fieldInfos->hasVectors())
mergeVectors(); (2.4)
return value;
}
=====>(2.1)
int32_t SegmentMerger::mergeFields()
{
//Create a new FieldInfos
fieldInfos = _CLNEW FieldInfos(); // merge field names
//Condition check to see if fieldInfos points to a valid instance
CND_CONDITION(fieldInfos != NULL,"Memory allocation for fieldInfos failed");
IndexReader* reader = NULL;
int32_t docCount = 0;
//Iterate through all readers
for(uint32_t i = 0; i < readers.size(); i++)
{
//get the i-th reader
reader = readers[i];
//Condition check to see if reader points to a valid instance
CND_CONDITION(reader != NULL,"No IndexReader found");
StringArrayWithDeletor tmp;
tmp.clear();
reader->getFieldNames(IndexReader::TERMVECTOR_WITH_POSITION_OFFSET, tmp);
addIndexed(reader, fieldInfos, tmp, true, true, true);
tmp.clear();
reader->getFieldNames(IndexReader::TERMVECTOR_WITH_POSITION, tmp);
addIndexed(reader, fieldInfos, tmp, true, true, false);
tmp.clear();
reader->getFieldNames(IndexReader::TERMVECTOR_WITH_OFFSET, tmp);
addIndexed(reader, fieldInfos, tmp, true, false, true);
tmp.clear();
reader->getFieldNames(IndexReader::TERMVECTOR, tmp);
addIndexed(reader, fieldInfos, tmp, true, false, false);
tmp.clear();
reader->getFieldNames(IndexReader::INDEXED, tmp);
addIndexed(reader, fieldInfos, tmp, false, false, false);
tmp.clear();
reader->getFieldNames(IndexReader::UNINDEXED, tmp);
if (tmp.size()>0){
TCHAR** arr = _CL_NEWARRAY(TCHAR*,tmp.size()+1);
tmp.toArray(arr);
fieldInfos->add((const TCHAR**)arr, false);
_CLDELETE_ARRAY(arr); //no need to delete the contents, since tmp is responsible for it
}
}
//Create the filename of the new FieldInfos file
const char* buf = Misc::segmentname(segment,".fnm");
//Write the new FieldInfos file to the directory
fieldInfos->write(directory, buf );
//Destroy the buffer of the filename
_CLDELETE_CaARRAY(buf);
// merge field values
//Instantiate Fieldswriter which will write in directory for the segment name segment
//Using the new merged fieldInfos
FieldsWriter* fieldsWriter = _CLNEW FieldsWriter(directory, segment, fieldInfos);
//Condition check to see if fieldsWriter points to a valid instance
CND_CONDITION(fieldsWriter != NULL,"Memory allocation for fieldsWriter failed");
try {
IndexReader* reader = NULL;
int32_t maxDoc = 0;
//Iterate through all readers
for (uint32_t i = 0; i < readers.size(); i++)
{
//get the i-th reader
reader = readers[i];
//Condition check to see if reader points to a valid instance
CND_CONDITION(reader != NULL, "No IndexReader found");
//Get the total number of documents, including those that have been marked deleted
maxDoc = reader->maxDoc();
//document buffer
Document doc;
//Iterate through all the documents managed by the current reader
for (int32_t j = 0; j < maxDoc; j++){
//Check if the j-th document has been deleted, if so skip it
if (!reader->isDeleted(j)){
//Get the document
if ( reader->document(j, &doc) ){
//Add the document to the new FieldsWriter
fieldsWriter->addDocument( &doc );
docCount++;
//doc is cleared for re-use
doc.clear();
}
}
}
}
}_CLFINALLY(
//Close the fieldsWriter
fieldsWriter->close();
//And have it deleted as it not used any more
_CLDELETE( fieldsWriter );
)
return docCount;
}
=====>void SegmentMerger::createCompoundFile(const char* filename, CL_NS(util)::AStringArrayWithDeletor& files)
{
}
=====>(2.2)
void SegmentMerger::mergeTerms()
{
}
=====>(2.3)
void SegmentMerger::mergeNorms()
{
}
Searching the index:
SearchHeader.h
struct ScoreDoc;
class TopDocs;
class HitCollector;
class Weight;
class HitDoc;
class Hits;
class Query;
class Searchable;
class Searcher:public Searchable;
IndexSearcher.h
class IndexSearcher:public Searcher;
QueryParserBase.h
class QueryParserBase;
QueryParser.h
class QueryParser : public QueryParserBase;
IndexSearcher.cpp
===============================================================================================================
STRCPY_AtoT(tline,line,80);
Query* q=QueryParser::parse(tline,_T("contents"),&analyzer); (1)
=====>(1). The static QueryParser::parse method
Query* QueryParser::parse(const TCHAR* query, const TCHAR* field, Analyzer* analyzer)
{
QueryParser parser(field, analyzer);
return parser.parse(query); (2)
}
=====>(2). The QueryParser::parse method
Query* QueryParser::parse(Reader* reader)
{
TokenList _tokens;
this->tokens = &_tokens;
//Instantiate a lexer
Lexer lexer(this, reader);
lexer.Lex(tokens);
Query* ret = MatchQuery(field); (3)
this->tokens = NULL;
return ret;
}
Hits* search(Query* query)
{
return search(query, (Filter*)NULL ); =====>(4)
}
=====>(4). The Hits* search method
Hits* search(Query* query, Filter* filter)
{
return _CLNEW Hits(this, query, filter);
}
Hits.cpp
Hits::Hits(Searcher* s, Query* q, Filter* f, const Sort* _sort):
query(q), searcher(s), filter(f), sort(_sort)
{
_length = 0;
first = NULL;
last = NULL;
numDocs = 0;
maxDocs = 200;
getMoreDocs(50); (5)
}
=====>(5). The Hits::getMoreDocs method
void Hits::getMoreDocs(const size_t m)
{
size_t _min = m;
{
size_t nHits = hitDocs.size();
if ( nHits > _min)
_min = nHits;
}
size_t n = _min * 2; // double # retrieved
TopDocs* topDocs = NULL;
if ( sort==NULL )
topDocs = (TopDocs*)((Searchable*)searcher)->_search(query, filter, n);
else
topDocs = (TopDocs*)((Searchable*)searcher)->_search(query, filter, n, sort);
_length = topDocs->totalHits;
ScoreDoc* scoreDocs = topDocs->scoreDocs;
int32_t scoreDocsLength = topDocs->scoreDocsLength;
float_t scoreNorm = 1.0f;
//Check that scoreDocs is a valid pointer before using it
if (scoreDocs != NULL)
{
if (_length > 0 && scoreDocs[0].score > 1.0f)
{
scoreNorm = 1.0f / scoreDocs[0].score;
}
int32_t end = scoreDocsLength < _length ? scoreDocsLength : _length;
for(int32_t i = hitDocs.size(); i < end; i++)
{
hitDocs.push_back(_CLNEW HitDoc(scoreDocs[i].score*scoreNorm, scoreDocs[i].doc));
}
}
_CLDELETE(topDocs);
}
=====>(6). The Hits::doc method
Document& Hits::doc(const int32_t n)
{
HitDoc* hitDoc = getHitDoc(n);
//Update LRU cache of documents
remove(hitDoc); // remove from list, if there
addToFront(hitDoc); // add to front of list
if (numDocs > maxDocs)
{ // if cache is full
HitDoc* oldLast = last;
remove(last); // flush last
_CLDELETE( oldLast->doc );
oldLast->doc = NULL;
}
if (hitDoc->doc == NULL)
{
hitDoc->doc = _CLNEW Document;
searcher->doc(hitDoc->id, hitDoc->doc); // cache miss: read document
}
return *hitDoc->doc;
}
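Putting the search-side pieces together, an end-to-end sketch assembled from the calls shown above (the header name, namespaces, and minor signatures are assumptions to verify against the CLucene sources):
#include "CLucene.h" // assumed umbrella header
using namespace lucene::analysis;
using namespace lucene::search;
using namespace lucene::queryParser;
using namespace lucene::document;
void searchContents(const char* indexPath, const TCHAR* queryText)
{
    standard::StandardAnalyzer analyzer;
    IndexSearcher searcher(indexPath);
    // The same static parse() used earlier, against the "contents" field.
    Query* q = QueryParser::parse(queryText, _T("contents"), &analyzer);
    Hits* hits = searcher.search(q);
    for (size_t i = 0; i < (size_t)hits->length(); ++i) {
        Document& doc = hits->doc(i);            // served through the LRU cache above
        const TCHAR* path = doc.get(_T("path")); // stored field written at index time
        (void)path; // ... print or collect the hit ...
    }
    _CLDELETE(hits);
    _CLDELETE(q);
    searcher.close();
}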