[原创]java解析非结构化文本 doc xls pdf ppt xml html

[原创]java解析非结构化文本 doc xls pdf ppt xml html
    在开发中,经常需要解析各类非结构化文本,例如 doc、xls、pdf、ppt、xml、html。
    本人在开发站内搜索时,需要加入对各类文件类型的支持,以方便建立索引。解析各类文档时调用了几个开源的包,如 dom4j-1.6.1.jar、FontBox-0.1.0-dev.jar、htmllexer.jar、htmlparser.jar、PDFBox-0.7.3.jar、poi-3.5-FINAL-20090928.jar、poi-scratchpad-3.5-FINAL-20090928.jar。这些开源的包可以让我们很方便地解析各类非结构化文本。

jar包的下载地址:  http://www.ziddu.com/download/7017588/devlib.rar.html

代码如下:
package  com.ducklyl;

import  java.io.File;
import  java.io.FileInputStream;
import  java.util.Iterator;
import  org.apache.poi.hslf.model.Slide;
import  org.apache.poi.hslf.model.TextRun;
import  org.apache.poi.hslf.usermodel.SlideShow;
import  org.apache.poi.hssf.usermodel.HSSFCell;
import  org.apache.poi.hssf.usermodel.HSSFRow;
import  org.apache.poi.hssf.usermodel.HSSFSheet;
import  org.apache.poi.hssf.usermodel.HSSFWorkbook;
import  org.apache.poi.hwpf.HWPFDocument;
import  org.apache.poi.hwpf.usermodel.Paragraph;
import  org.apache.poi.hwpf.usermodel.Range;
import  org.dom4j.Document;
import  org.dom4j.Element;
import  org.dom4j.io.SAXReader;
import  org.htmlparser.Parser;
import  org.htmlparser.filters. * ;

import  org.htmlparser. * ;
import  org.htmlparser.nodes.TextNode;
import  org.htmlparser.util. * ;

import  org.pdfbox.pdfparser.PDFParser;
import  org.pdfbox.pdmodel.PDDocument;
import  org.pdfbox.util.PDFTextStripper;



public   class  HandleFile {
    
public   static   void  main(String args[]){
        String str
= " e:\\test.HTML " ;
        System.out.println(handleFile(str));
    }

    
public   static  String handleFile(String filename){
        String result
= "" ;
        String fileType
= filename.substring(filename.lastIndexOf( " . " ) + 1 , filename.length());
        
if (fileType.equalsIgnoreCase( " pdf " ))
            result
= handlePdf(filename);
        
else   if (fileType.equalsIgnoreCase( " xls " ))
            result
= handleExcel(filename);
        
else   if (fileType.equalsIgnoreCase( " doc " ))
            result
= handleDoc(filename);
        
else   if (fileType.equalsIgnoreCase( " xml " ))
            result
= handleXml(filename);
        
else   if (fileType.equalsIgnoreCase( " ppt " ))
            result
= handlePPT(filename);
        
else   if (fileType.equalsIgnoreCase( " htm " ) || fileType.equalsIgnoreCase( " html " ))
            result
= handleHtml(filename);
        
return  result;
    }
/**
 * 解析HTML
 * 
@param  filename
 * 
@return
 
*/
    
public   static  String handleHtml(String filename){
        String content
= "" ;
        
try {
            File file
= new  File(filename);
            
if ( ! file.exists())  return  content;
            
            Parser parser
= new  Parser(filename);
            parser.setEncoding(
" UTF-8 " );
            NodeFilter textFilter
= new  NodeClassFilter(TextNode. class );
            NodeList nodes
= parser.extractAllNodesThatMatch(textFilter);
            
for ( int  i = 0 ;i < nodes.size();i ++ ){
                TextNode textnode
= (TextNode)nodes.elementAt(i);
                String line
= textnode.toPlainTextString().trim();
                
if (line.equals( "" ))  continue ;
                content
= content + line;
            }
        }
catch (Exception e){
            e.printStackTrace();
        }
        
return  content;
    }
    
/**
     * 解析PPT
     * 
@param  filename
     * 
@return
     
*/
    
public   static  String handlePPT(String filename){
        StringBuffer content 
=   new  StringBuffer( "" );
        
try {
            File file
= new  File(filename);
            
if ( ! file.exists()) {
                
return  content.toString();
            }
            FileInputStream instream
= new  FileInputStream(file);
            SlideShow ppt 
=   new  SlideShow(instream);
            Slide[] slides 
=  ppt.getSlides();
            
for ( int  i = 0 ;i < slides.length;i ++ ){
                TextRun[] t 
=  slides[i].getTextRuns(); // 为了取得幻灯片的文字内容,建立TextRun
                 for ( int  j = 0 ;j < t.length;j ++ ){
                    content.append(t[j].getText());
// 这里会将文字内容加到content中去
                }
                content.append(slides[i].getTitle());
            }
        }
catch (Exception e){
            e.printStackTrace();
        }
        
return  content.toString();
    }
    
/**
     * 解析XML
     * 
@param  filename
     * 
@return
     
*/
    
public   static  String handleXml(String filename){
        String content
= "" ,value = "" ,text = "" ;
        
try {
            File file
= new  File(filename);
            
if ( ! file.exists()) {
                
return  content;
            }
              SAXReader saxReader 
=   new  SAXReader();
              Document document 
=  saxReader.read(file);
              Element root 
=  document.getRootElement() ;
              
              Iterator iter
= root.elementIterator() ;
               
while (iter.hasNext()){
                  Element element
= (Element)iter.next();
                  value
= element.getStringValue();
                  
if ( ! value.trim().equals( "" ))    content = content + value;
               }
        }
catch (Exception e){
                e.printStackTrace();
        }
        
return  content;
    }
    
/**
     * 解析DOC
     * 
@param  filename
     * 
@return
     
*/
    
public   static  String handleDoc(String filename){
        String content
= "" ;
        
try {
            File file
= new  File(filename);
            
if ( ! file.exists()) {
                
return  content;
            }
            FileInputStream instream
= new  FileInputStream(file);
            HWPFDocument doc
= new  HWPFDocument(instream);
            Range range
= doc.getRange();
            String text
= range.text();
            
for ( int  i = 0 ;i < range.numParagraphs();i ++ ){
                Paragraph p
= range.getParagraph(i);
                content
= content + p.text().trim() + " \n " ;
            }
        }
catch (Exception e){
            e.printStackTrace();
        }
        
return  content;
    }
    
/**
     * 解析PDF
     * 
@param  filename
     * 
@return
     
*/
    
public   static  String handlePdf(String filename){
        String contenttxt
= "" ;
        
try {
            File file
= new  File(filename);
            
if ( ! file.exists()){
                
return  contenttxt;
            }
            FileInputStream instream
= new  FileInputStream(file);
            PDFParser parser
= new  PDFParser(instream);
            parser.parse();
            PDDocument pdfdocument
= parser.getPDDocument();
            PDFTextStripper pdfstripper
= new  PDFTextStripper();
            contenttxt
= pdfstripper.getText(pdfdocument);
        }
catch (Exception e){
            e.printStackTrace();
        }
        
return  contenttxt;
    }
    
/**
     * 解析EXCEL
     * 
@param  filename
     * 
@return
     
*/
    
public   static  String handleExcel(String filename){
        String content
= "" ;
        
try {
            File file
= new  File(filename);
            
if ( ! file.exists()) {
                
return  content;
            }
            HSSFWorkbook workbook
= new  HSSFWorkbook( new  FileInputStream(file));
            HSSFSheet sheet
= workbook.getSheetAt( 0 );
            
            
for ( int  i = 0 ;i < workbook.getNumberOfSheets();i ++ ){
                sheet
= workbook.getSheetAt(i);
                
if (sheet != null ){
                    
for ( int  m = 0 ;m < sheet.getLastRowNum();m ++ ){
                        HSSFRow row
= sheet.getRow(m);
                        
if (row == null break ;
                        
                        
for ( int  n = 0 ;n < row.getLastCellNum();n ++ ){
                            HSSFCell cell
= row.getCell(n);
                            
if (cell == null break ;
                            
int  type = cell.getCellType();
                            
switch (type){
                                
case   0 :
                                    content
= content + cell.getNumericCellValue();
                                    
break ;
                                
case   1 :
                                    content
= content + cell.getStringCellValue();
                                    
break ;
                                
case   3 :
                                    
break ;
                                
default :
                                    ;
                            }
                        }
                        content
= content + " \n " ;
                    }
                }
                content
= content + " \n " ;
            }

        }
catch (Exception e){    
            e.printStackTrace();
        }
        
return  content;
    }
}
不想拷贝的朋友可以直接下载源代码: http://www.ziddu.com/download/7017614/src.txt.html

以上代码比较简单,就不作说明,希望能帮到有需要的朋友。当然上面只是一个简单的例子,如果要具体应用,大家可以自己再改写。如果你有其它的想法,欢迎分享你的精彩想法。


转载请注明出处

你可能感兴趣的:([原创]java解析非结构化文本 doc xls pdf ppt xml html)