Lucene索引,查询及高亮显示

本文通过一段示例代码,简单展示如何使用 TermQuery 和 FuzzyLikeThisQuery 对索引进行查询,以及如何在查询结果中高亮显示匹配的关键字(这在实际使用中是一个很有用的功能)。

  1 public class Indexer

  2 {

  3 

  4     /**

  5      * @param args

  6      * @throws IOException

  7      * @throws LockObtainFailedException

  8      * @throws CorruptIndexException

  9      * @throws InvalidTokenOffsetsException 

 10      */

 11     public static void main(String[] args) throws CorruptIndexException,

 12             LockObtainFailedException, IOException, InvalidTokenOffsetsException

 13     {

 14         Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

 15 

 16         IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);

 17         config.setOpenMode(OpenMode.CREATE_OR_APPEND);

 18         

 19         Directory indexDir = new RAMDirectory();

 20 

 21         /**

 22          * 1. Indexing...

 23          */

 24         IndexWriter writer = new IndexWriter(indexDir, config);

 25 

 26         File docs = new File("D:\\files");

 27 

 28         if (docs.exists() && docs.isDirectory())

 29         {

 30             File[] files = docs.listFiles();

 31             

 32             if (files != null && files.length > 0)

 33             {

 34                 for (File file : files)

 35                 {

 36 //                    •Field.Index.NO 不索引,如果存储选项为YES,一般用于只保存不搜索的字段;

 37 //                    •Field.Index.ANALYZED 分词建索引;

 38 //                    •Field.Index.NOT_ANALYZED 建索引但不分词,字段虽然被索引但是没有任何分析器对字段进行分析,只能整词精确搜索,可保存唯一性字段(例如ID)并用于更新索引

 39                     Document doc = new Document();

 40                     doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES, Field.Index.NO));

 41                     doc.add(new Field("id", file.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED));

 42                     doc.add(new Field("name", file.getName(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));

 43                     

 44                     doc.add(new Field("size", file.getTotalSpace() + "b", Field.Store.YES, Field.Index.NO));

 45                     

 46                     writer.addDocument(doc);

 47                 }

 48                 

 49                 writer.commit();

 50             }

 51         }

 52         

 53         writer.close(true);

 54         

 55         

 56         /**

 57          * 2. List indexed files ...

 58          */

 59         IndexReader reader = IndexReader.open(indexDir);

 60         IndexSearcher searcher = new IndexSearcher(reader);

 61         

 62         System.out.println("Max doc:" + searcher.maxDoc());

 63         System.out.println("List files below....");

 64         

 65         Document doc = null;

 66         for (int i = 0; i < searcher.maxDoc(); i++)

 67         {

 68             doc = searcher.doc(i);

 69             System.out.println("Doc " + i + " Name: " + doc.get("name") + ", Path: " + doc.get("path") + ", Size: " + doc.get("size"));

 70         }

 71         System.out.println("===================================================================================");

 72         

 73         

 74         /**

 75          * 3.Searching...

 76          */

 77         String id = "we";

 78         // 此处若改为Query queryId = new TermQuery(new Term("id", id));则无法搜索出结果,除非id = "We are young.txt";

 79         Query queryId = new TermQuery(new Term("name", id));

 80         TopDocs hitsForId = searcher.search(queryId, null, 100);

 81         if (hitsForId != null && hitsForId.totalHits > 0)

 82         {

 83             System.out.println("Searched " + hitsForId.totalHits + " docs for id " + id + "...");

 84             

 85             for (int j = 0; j < hitsForId.scoreDocs.length; j++)

 86             {

 87                 System.out.println("Score doc for id " + j + " is " + hitsForId.scoreDocs[j].toString());

 88             }

 89         }

 90         System.out.println("===================================================================================");

 91         

 92         String keyword = "we are yy";

 93         FuzzyLikeThisQuery fuzzyLikeThisQuery = new FuzzyLikeThisQuery(100, analyzer);

 94         fuzzyLikeThisQuery.addTerms(keyword, "name", 0.8F, 0);

 95         

 96         // FuzzyLikeThisQuery不是lucene core自带的查询类,属于contrib的query模块

 97         // 默认情况下QueryScorer的私有成员WeightedSpanTermExtractor无法识别它,getBestFragment将返回null

 98         // 因此此处调用rewrite生成一个WeightedSpanTermExtractor可以识别的query对象,用于匹配内容关键字

 99         Query query = fuzzyLikeThisQuery.rewrite(reader);

100         

101         // 高亮显示关键字,如果内容中本来就有<span></span>,可能导致显示错乱

102         SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span>", "</span>");

103         Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));  

104         

105         TopDocs hits = searcher.search(fuzzyLikeThisQuery, null, 100);

106         

107         if (hits != null && hits.totalHits > 0)

108         {

109             System.out.println("Searched " + hits.totalHits + "docs for keyword " + keyword + "...");

110             

111             ScoreDoc[] sDocs = hits.scoreDocs;

112             

113             Document docMatched = null;

114             for (int j = 0; j < sDocs.length; j++)

115             {

116                 System.out.println("Score doc " + j + " is " + sDocs[j].toString());

117                 

118                 docMatched = searcher.doc(sDocs[j].doc);

119                 

120                 TokenStream tokenStream = analyzer.tokenStream("name", new StringReader(docMatched.get("name")));  

121                 String str = highlighter.getBestFragment(tokenStream,  docMatched.get("name"));

122                 

123                 System.out.println("Score doc " + j + " hightlight to: " + str);

124 

125             }

126         }

127         

128         reader.close();

129         indexDir.close();

130     }

131 }

 

输出如下(Size 一列各行数值相同,是因为这次运行用 File.getTotalSpace() 取值,得到的是文件所在分区的总容量,而不是文件自身的大小):

Max doc:13
List files below....
Doc 0 Name: ab.txt, Path: D:\files\ab.txt, Size: 104857595904b
Doc 1 Name: abc.txt, Path: D:\files\abc.txt, Size: 104857595904b
Doc 2 Name: M_1.txt, Path: D:\files\M_1.txt, Size: 104857595904b
Doc 3 Name: M_11.txt, Path: D:\files\M_11.txt, Size: 104857595904b
Doc 4 Name: We are young.txt, Path: D:\files\We are young.txt, Size: 104857595904b
Doc 5 Name: 什么是微博.txt, Path: D:\files\什么是微博.txt, Size: 104857595904b
Doc 6 Name: 喝水不忘挖井人.txt, Path: D:\files\喝水不忘挖井人.txt, Size: 104857595904b
Doc 7 Name: 天苍苍野茫茫.txt, Path: D:\files\天苍苍野茫茫.txt, Size: 104857595904b
Doc 8 Name: 怎么使用lucene.txt, Path: D:\files\怎么使用lucene.txt, Size: 104857595904b
Doc 9 Name: 神马是一种马吗.txt, Path: D:\files\神马是一种马吗.txt, Size: 104857595904b
Doc 10 Name: 苍井.txt, Path: D:\files\苍井.txt, Size: 104857595904b
Doc 11 Name: 苍白 - 副本.txt, Path: D:\files\苍白 - 副本.txt, Size: 104857595904b
Doc 12 Name: 苍白.txt, Path: D:\files\苍白.txt, Size: 104857595904b
===================================================================================
Searched 1 docs for id we...
Score doc for id 0 is doc=4 score=1.7948763 shardIndex=-1
===================================================================================
Searched 1docs for keyword we are yy...
Score doc 0 is doc=4 score=0.625 shardIndex=-1
Score doc 0 hightlight to: <span>We</span> are young.txt

你可能感兴趣的:(Lucene)