[原创]java解析非结构化文本 doc xls pdf ppt xml html
在开发中,经常需要解析各类非结构化文本,例如 doc、xls、pdf、ppt、xml、html 等。
本人在开发站内搜索时,需要加入对各类文件类型的支持,以方便建立索引。解析各类文档时调用了几个开源的包,如 dom4j-1.6.1.jar、FontBox-0.1.0-dev.jar、htmllexer.jar、htmlparser.jar、PDFBox-0.7.3.jar、poi-3.5-FINAL-20090928.jar、poi-scratchpad-3.5-FINAL-20090928.jar。这些开源的包可以让我们很方便地解析各类非结构化文本。
jar包的下载地址: http://www.ziddu.com/download/7017588/devlib.rar.html
代码如下:
以上代码比较简单,就不作说明了,希望能帮到有需要的朋友。当然,上面只是一个简单的例子,如果要具体应用,大家可以自己再改写。如果你有其它的想法,欢迎分享你的精彩想法。
转载请注明出处
本人在开发站内搜索时,需要加入对各类文件类型的支持,以方便建立索引。解析各类文档时调用了几个开源的包,如 dom4j-1.6.1.jar、FontBox-0.1.0-dev.jar、htmllexer.jar、htmlparser.jar、PDFBox-0.7.3.jar、poi-3.5-FINAL-20090928.jar、poi-scratchpad-3.5-FINAL-20090928.jar。这些开源的包可以让我们很方便地解析各类非结构化文本。
jar包的下载地址: http://www.ziddu.com/download/7017588/devlib.rar.html
代码如下:
package
com.ducklyl;
import java.io.File;
import java.io.FileInputStream;
import java.util.Iterator;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.htmlparser.Parser;
import org.htmlparser.filters. * ;
import org.htmlparser. * ;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util. * ;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
public class HandleFile {
public static void main(String args[]){
String str = " e:\\test.HTML " ;
System.out.println(handleFile(str));
}
public static String handleFile(String filename){
String result = "" ;
String fileType = filename.substring(filename.lastIndexOf( " . " ) + 1 , filename.length());
if (fileType.equalsIgnoreCase( " pdf " ))
result = handlePdf(filename);
else if (fileType.equalsIgnoreCase( " xls " ))
result = handleExcel(filename);
else if (fileType.equalsIgnoreCase( " doc " ))
result = handleDoc(filename);
else if (fileType.equalsIgnoreCase( " xml " ))
result = handleXml(filename);
else if (fileType.equalsIgnoreCase( " ppt " ))
result = handlePPT(filename);
else if (fileType.equalsIgnoreCase( " htm " ) || fileType.equalsIgnoreCase( " html " ))
result = handleHtml(filename);
return result;
}
/**
* 解析HTML
* @param filename
* @return
*/
public static String handleHtml(String filename){
String content = "" ;
try {
File file = new File(filename);
if ( ! file.exists()) return content;
Parser parser = new Parser(filename);
parser.setEncoding( " UTF-8 " );
NodeFilter textFilter = new NodeClassFilter(TextNode. class );
NodeList nodes = parser.extractAllNodesThatMatch(textFilter);
for ( int i = 0 ;i < nodes.size();i ++ ){
TextNode textnode = (TextNode)nodes.elementAt(i);
String line = textnode.toPlainTextString().trim();
if (line.equals( "" )) continue ;
content = content + line;
}
} catch (Exception e){
e.printStackTrace();
}
return content;
}
/**
* 解析PPT
* @param filename
* @return
*/
public static String handlePPT(String filename){
StringBuffer content = new StringBuffer( "" );
try {
File file = new File(filename);
if ( ! file.exists()) {
return content.toString();
}
FileInputStream instream = new FileInputStream(file);
SlideShow ppt = new SlideShow(instream);
Slide[] slides = ppt.getSlides();
for ( int i = 0 ;i < slides.length;i ++ ){
TextRun[] t = slides[i].getTextRuns(); // 为了取得幻灯片的文字内容,建立TextRun
for ( int j = 0 ;j < t.length;j ++ ){
content.append(t[j].getText()); // 这里会将文字内容加到content中去
}
content.append(slides[i].getTitle());
}
} catch (Exception e){
e.printStackTrace();
}
return content.toString();
}
/**
* 解析XML
* @param filename
* @return
*/
public static String handleXml(String filename){
String content = "" ,value = "" ,text = "" ;
try {
File file = new File(filename);
if ( ! file.exists()) {
return content;
}
SAXReader saxReader = new SAXReader();
Document document = saxReader.read(file);
Element root = document.getRootElement() ;
Iterator iter = root.elementIterator() ;
while (iter.hasNext()){
Element element = (Element)iter.next();
value = element.getStringValue();
if ( ! value.trim().equals( "" )) content = content + value;
}
} catch (Exception e){
e.printStackTrace();
}
return content;
}
/**
* 解析DOC
* @param filename
* @return
*/
public static String handleDoc(String filename){
String content = "" ;
try {
File file = new File(filename);
if ( ! file.exists()) {
return content;
}
FileInputStream instream = new FileInputStream(file);
HWPFDocument doc = new HWPFDocument(instream);
Range range = doc.getRange();
String text = range.text();
for ( int i = 0 ;i < range.numParagraphs();i ++ ){
Paragraph p = range.getParagraph(i);
content = content + p.text().trim() + " \n " ;
}
} catch (Exception e){
e.printStackTrace();
}
return content;
}
/**
* 解析PDF
* @param filename
* @return
*/
public static String handlePdf(String filename){
String contenttxt = "" ;
try {
File file = new File(filename);
if ( ! file.exists()){
return contenttxt;
}
FileInputStream instream = new FileInputStream(file);
PDFParser parser = new PDFParser(instream);
parser.parse();
PDDocument pdfdocument = parser.getPDDocument();
PDFTextStripper pdfstripper = new PDFTextStripper();
contenttxt = pdfstripper.getText(pdfdocument);
} catch (Exception e){
e.printStackTrace();
}
return contenttxt;
}
/**
* 解析EXCEL
* @param filename
* @return
*/
public static String handleExcel(String filename){
String content = "" ;
try {
File file = new File(filename);
if ( ! file.exists()) {
return content;
}
HSSFWorkbook workbook = new HSSFWorkbook( new FileInputStream(file));
HSSFSheet sheet = workbook.getSheetAt( 0 );
for ( int i = 0 ;i < workbook.getNumberOfSheets();i ++ ){
sheet = workbook.getSheetAt(i);
if (sheet != null ){
for ( int m = 0 ;m < sheet.getLastRowNum();m ++ ){
HSSFRow row = sheet.getRow(m);
if (row == null ) break ;
for ( int n = 0 ;n < row.getLastCellNum();n ++ ){
HSSFCell cell = row.getCell(n);
if (cell == null ) break ;
int type = cell.getCellType();
switch (type){
case 0 :
content = content + cell.getNumericCellValue();
break ;
case 1 :
content = content + cell.getStringCellValue();
break ;
case 3 :
break ;
default :
;
}
}
content = content + " \n " ;
}
}
content = content + " \n " ;
}
} catch (Exception e){
e.printStackTrace();
}
return content;
}
}
不想拷贝的朋友可以直接下载源代码:
http://www.ziddu.com/download/7017614/src.txt.html
import java.io.File;
import java.io.FileInputStream;
import java.util.Iterator;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.htmlparser.Parser;
import org.htmlparser.filters. * ;
import org.htmlparser. * ;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util. * ;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
public class HandleFile {
public static void main(String args[]){
String str = " e:\\test.HTML " ;
System.out.println(handleFile(str));
}
public static String handleFile(String filename){
String result = "" ;
String fileType = filename.substring(filename.lastIndexOf( " . " ) + 1 , filename.length());
if (fileType.equalsIgnoreCase( " pdf " ))
result = handlePdf(filename);
else if (fileType.equalsIgnoreCase( " xls " ))
result = handleExcel(filename);
else if (fileType.equalsIgnoreCase( " doc " ))
result = handleDoc(filename);
else if (fileType.equalsIgnoreCase( " xml " ))
result = handleXml(filename);
else if (fileType.equalsIgnoreCase( " ppt " ))
result = handlePPT(filename);
else if (fileType.equalsIgnoreCase( " htm " ) || fileType.equalsIgnoreCase( " html " ))
result = handleHtml(filename);
return result;
}
/**
* 解析HTML
* @param filename
* @return
*/
public static String handleHtml(String filename){
String content = "" ;
try {
File file = new File(filename);
if ( ! file.exists()) return content;
Parser parser = new Parser(filename);
parser.setEncoding( " UTF-8 " );
NodeFilter textFilter = new NodeClassFilter(TextNode. class );
NodeList nodes = parser.extractAllNodesThatMatch(textFilter);
for ( int i = 0 ;i < nodes.size();i ++ ){
TextNode textnode = (TextNode)nodes.elementAt(i);
String line = textnode.toPlainTextString().trim();
if (line.equals( "" )) continue ;
content = content + line;
}
} catch (Exception e){
e.printStackTrace();
}
return content;
}
/**
* 解析PPT
* @param filename
* @return
*/
public static String handlePPT(String filename){
StringBuffer content = new StringBuffer( "" );
try {
File file = new File(filename);
if ( ! file.exists()) {
return content.toString();
}
FileInputStream instream = new FileInputStream(file);
SlideShow ppt = new SlideShow(instream);
Slide[] slides = ppt.getSlides();
for ( int i = 0 ;i < slides.length;i ++ ){
TextRun[] t = slides[i].getTextRuns(); // 为了取得幻灯片的文字内容,建立TextRun
for ( int j = 0 ;j < t.length;j ++ ){
content.append(t[j].getText()); // 这里会将文字内容加到content中去
}
content.append(slides[i].getTitle());
}
} catch (Exception e){
e.printStackTrace();
}
return content.toString();
}
/**
* 解析XML
* @param filename
* @return
*/
public static String handleXml(String filename){
String content = "" ,value = "" ,text = "" ;
try {
File file = new File(filename);
if ( ! file.exists()) {
return content;
}
SAXReader saxReader = new SAXReader();
Document document = saxReader.read(file);
Element root = document.getRootElement() ;
Iterator iter = root.elementIterator() ;
while (iter.hasNext()){
Element element = (Element)iter.next();
value = element.getStringValue();
if ( ! value.trim().equals( "" )) content = content + value;
}
} catch (Exception e){
e.printStackTrace();
}
return content;
}
/**
* 解析DOC
* @param filename
* @return
*/
public static String handleDoc(String filename){
String content = "" ;
try {
File file = new File(filename);
if ( ! file.exists()) {
return content;
}
FileInputStream instream = new FileInputStream(file);
HWPFDocument doc = new HWPFDocument(instream);
Range range = doc.getRange();
String text = range.text();
for ( int i = 0 ;i < range.numParagraphs();i ++ ){
Paragraph p = range.getParagraph(i);
content = content + p.text().trim() + " \n " ;
}
} catch (Exception e){
e.printStackTrace();
}
return content;
}
/**
* 解析PDF
* @param filename
* @return
*/
public static String handlePdf(String filename){
String contenttxt = "" ;
try {
File file = new File(filename);
if ( ! file.exists()){
return contenttxt;
}
FileInputStream instream = new FileInputStream(file);
PDFParser parser = new PDFParser(instream);
parser.parse();
PDDocument pdfdocument = parser.getPDDocument();
PDFTextStripper pdfstripper = new PDFTextStripper();
contenttxt = pdfstripper.getText(pdfdocument);
} catch (Exception e){
e.printStackTrace();
}
return contenttxt;
}
/**
* 解析EXCEL
* @param filename
* @return
*/
public static String handleExcel(String filename){
String content = "" ;
try {
File file = new File(filename);
if ( ! file.exists()) {
return content;
}
HSSFWorkbook workbook = new HSSFWorkbook( new FileInputStream(file));
HSSFSheet sheet = workbook.getSheetAt( 0 );
for ( int i = 0 ;i < workbook.getNumberOfSheets();i ++ ){
sheet = workbook.getSheetAt(i);
if (sheet != null ){
for ( int m = 0 ;m < sheet.getLastRowNum();m ++ ){
HSSFRow row = sheet.getRow(m);
if (row == null ) break ;
for ( int n = 0 ;n < row.getLastCellNum();n ++ ){
HSSFCell cell = row.getCell(n);
if (cell == null ) break ;
int type = cell.getCellType();
switch (type){
case 0 :
content = content + cell.getNumericCellValue();
break ;
case 1 :
content = content + cell.getStringCellValue();
break ;
case 3 :
break ;
default :
;
}
}
content = content + " \n " ;
}
}
content = content + " \n " ;
}
} catch (Exception e){
e.printStackTrace();
}
return content;
}
}
以上代码比较简单,就不作说明了,希望能帮到有需要的朋友。当然,上面只是一个简单的例子,如果要具体应用,大家可以自己再改写。如果你有其它的想法,欢迎分享你的精彩想法。
转载请注明出处