使用Dom4j解析XML,并利用Lucene建立索引并搜索
package jim.luceneXML; import java.io.File; import java.io.IOException; import java.util.Iterator; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; import org.dom4j.Element; import org.dom4j.io.SAXReader; import org.wltea.analyzer.lucene.IKAnalyzer; //import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.io.SAXReader; @SuppressWarnings("unused") public class MyTest { /** * @param args */ public static void main(String[] args) { // TODO Auto-generated method stub try { new MyLucene().creatIndex("test.xml"); } catch (DocumentException e) { // TODO Auto-generated catch block e.printStackTrace(); } MySearch searcher = new MySearch(); searcher.Search(); } } class MyLucene{ Directory directory = null;//用来决定引索目录的储存方式 IndexWriter writer = null;//引索器 Document document = null;//引索文件 Field field = null;//字段信息 IndexWriterConfig iwc = null;//用来选择lucene的版本以及分词器的版本 String indexPath = "index";//引索目录的储存地址 String title = "title";//文件的标题 String content = "content";//文件的内容 String [] files = null;//用来记录文件夹里所有文件的地址 String XmlContent = null;//用来记录从XML文件中读取来的内容 String XmlName = null; //用来记录XML文件的节点名 int num= 0; public void creatIndex(String fileName) throws DocumentException{ //构造器 try { directory = FSDirectory.open(new File(indexPath));//创建directory,其储存方式为在硬盘上储存 iwc = new IndexWriterConfig(Version.LUCENE_35, new IKAnalyzer());//选择lucene的版本以及分词器的版本 writer = new IndexWriter(directory,iwc);//创建引索器 } catch (IOException e) { System.out.println("创建Directory时发生错误!"); // TODO Auto-generated catch block e.printStackTrace(); } SAXReader 
saxReader = new SAXReader(); org.dom4j.Document doc = saxReader.read(new File(fileName)); //List list = doc.selectNodes("/books/book/title"); Element root = doc.getRootElement(); @SuppressWarnings("rawtypes") Iterator iter = root.elementIterator(); while(iter.hasNext()){ Element rootElement = (Element)iter.next(); @SuppressWarnings("rawtypes") Iterator childElementIter = rootElement.elementIterator(); document = new Document();//创建索引文件 while(childElementIter.hasNext()){ Element childElement = (Element)childElementIter.next(); System.out.println(childElement.getName()+": "+childElement.getText()); XmlContent = childElement.getText(); XmlName = childElement.getName(); if(XmlName.equals("id")){ field = new Field("id",String.valueOf(++num),Field.Store.YES,Index.NOT_ANALYZED); } else if(XmlName.equals("title")){ field = new Field(XmlName,XmlContent,Field.Store.YES,Index.NOT_ANALYZED); } else if(XmlName.equals("keywords")){ field = new Field(XmlName,XmlContent,Field.Store.YES,Index.ANALYZED); } else if(XmlName.equals("kind")){ field = new Field(XmlName,XmlContent,Field.Store.YES,Index.NOT_ANALYZED); } else if(XmlName.equals("describe")){ field = new Field(XmlName,XmlContent,Field.Store.YES,Index.ANALYZED); } else if(XmlName.equals("date")){ field = new Field(XmlName,XmlContent,Field.Store.YES,Index.NOT_ANALYZED); } else if(XmlName.equals("url")){ field = new Field(XmlName,XmlContent,Field.Store.YES,Index.NOT_ANALYZED); } else if(XmlName.equals("author")){ field = new Field(XmlName,XmlContent,Field.Store.YES,Index.NOT_ANALYZED); } else if(XmlName.equals("publisher")){ field = new Field(XmlName,XmlContent,Field.Store.YES,Index.NOT_ANALYZED); } document.add(field); try { writer.addDocument(document); } catch (CorruptIndexException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } try { writer.close(); } catch (CorruptIndexException e) { // TODO Auto-generated catch block 
e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println("Index is Created!"); } }
package jim.luceneXML; import java.io.File; import java.io.IOException; import org.apache.lucene.document.Document; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.wltea.analyzer.lucene.IKAnalyzer; public class MySearch { Directory directory = null;//存储方式 String indexPath = "index";//引索存放的目录 IndexReader reader = null;//读入引索 IndexSearcher searcher = null;//确定搜索对象 QueryParser parser = null;//用于确定搜索时的引索的版本以及分词器 Query query = null;//记录要搜索的词语 TopDocs tds = null;//记录搜索后返回的结果 Document document = null;//存放搜索结果以便于提取结果 ScoreDoc[] sds = null;//存放TopDocs传来的内容(搜索结果) public void Search(){ try { directory = FSDirectory.open(new File(indexPath)); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("创建Directory时发生错误!"); e.printStackTrace(); }//创建directory,其储存方式为在硬盘上储存 try { reader = IndexReader.open(directory); } catch (CorruptIndexException e) { // TODO Auto-generated catch block System.out.println("创建IndexReader时发生错误!"); e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("创建IndexReader时发生错误!"); e.printStackTrace(); } searcher = new IndexSearcher(reader); parser = new QueryParser(Version.LUCENE_35,"keywords",new IKAnalyzer()); try { query = parser.parse("期中"); } catch (ParseException e) { // TODO Auto-generated catch block System.out.println("query = parser.parse(\"keyword\")时发生错误"); e.printStackTrace(); } try { tds = searcher.search(query,10); } catch (IOException e) { System.out.println("std = searcher.search(query,5);时发生错误"); // TODO Auto-generated 
catch block e.printStackTrace(); } sds = tds.scoreDocs; System.out.println("一共搜索到: "+sds.length+" 条"); if(sds.length != 0){ for( ScoreDoc sd:sds){ try { document = searcher.doc(sd.doc); } catch (CorruptIndexException e) { // TODO Auto-generated catch block System.out.println("document = searcher.doc(sd.doc);时发生错误"); e.printStackTrace(); } catch (IOException e) { System.out.println("document = searcher.doc(sd.doc);时发生错误"); e.printStackTrace(); } String test = document.get("id"); System.out.println(test+document.get("keywords")+document.get("url")); //System.out.println("Id: "+document.get("id")+" name: "+document.get("name")+" number: "+document.get("number")); } } else System.out.println("The word you enter can't be found!"); try { reader.close(); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("关闭reader时发生错误!"); e.printStackTrace(); } System.out.println("Finished"); } public void check() throws IOException{ directory = FSDirectory.open(new File("index")); IndexReader reader = IndexReader.open(directory); for(int i = 0;i<reader.numDocs();i++){ System.out.println(reader.document(i)); } } }
今天对Dom4j解析XML有了一定的了解,并写了一个测试程序.
今天的不足:
对Dom4j解析XML的掌握还不够完善,程序做得不够细致,希望明天能够改善这个问题.