常用文件转换为txt文本

先贴代码:


import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;

import org.apache.poi.hssf.usermodel.*;
import org.textmining.text.extraction.WordExtractor;

/**
 * Extracts plain text from common document formats: Word (.doc), Excel (.xls),
 * plain text files and, through an external pdftotext executable, PDF.
 *
 * <p>Every conversion method reports failures on the console and returns
 * whatever text was collected so far (an empty string if nothing was read)
 * instead of throwing.
 */
public class Doc2Text {

    /** Location of the external pdftotext executable used by {@link #Pdf2Text(String)}. */
    private static final String PATH_TO_XPDF = "D:\\pdftotext.exe";

    /** Number of bytes inspected when looking for a UTF-8 byte order mark. */
    private static final int BOM_LENGTH = 3;

    /**
     * Extracts the text of a Word document.
     *
     * @param fileName path of the Word file to read
     * @return the extracted text, or an empty string if the file could not be
     *         opened or parsed
     */
    public static String Word2Text(String fileName) {

        String str = "";
        FileInputStream in = null;
        try {
            in = new FileInputStream(fileName);
            WordExtractor extractor = new WordExtractor();
            str = extractor.extractText(in);
        } catch (Exception e) {
            // Covers a missing file as well as any extraction failure.
            e.printStackTrace();
        } finally {
            // Null-safe: the stream is still null when the file could not be opened.
            closeQuietly(in);
        }
        return str;
    }

    /**
     * Extracts the numeric and string cells of every sheet of an Excel workbook.
     * Each extracted cell is followed by a tab and each non-empty row by a
     * newline; cells of any other type are skipped.
     *
     * @param filename path of the .xls file to read
     * @return the extracted text; if reading fails part-way, the text collected
     *         up to that point
     */
    public static String Excel2Text(String filename) {

        // A buffer avoids re-copying the whole text for every appended cell.
        StringBuffer text = new StringBuffer();
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(filename);
            HSSFWorkbook workbook = new HSSFWorkbook(fis);
            for (int sheetnum = 0; sheetnum < workbook.getNumberOfSheets(); sheetnum++) {
                HSSFSheet sheet = workbook.getSheetAt(sheetnum);
                // getLastRowNum() is the 0-based index of the last row, so the
                // bound must be inclusive or the final row is silently dropped.
                int lastrow = sheet.getLastRowNum();
                for (int rownum = 0; rownum <= lastrow; rownum++) {
                    HSSFRow row = sheet.getRow(rownum);
                    if (row == null) {
                        continue;
                    }
                    short firstcell = row.getFirstCellNum();
                    short lastcell = row.getLastCellNum();
                    for (short cellnum = firstcell; cellnum < lastcell; cellnum++) {
                        HSSFCell cell = row.getCell(cellnum);
                        if (cell == null) {
                            continue;
                        }
                        if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
                            text.append(cell.getNumericCellValue()).append("\t");
                        } else if (cell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
                            text.append(cell.getStringCellValue()).append("\t");
                        }
                    }
                    text.append("\n");
                }
            }
        } catch (Exception e) {
            System.out.println(e);
        } finally {
            closeQuietly(fis);
        }
        return text.toString();
    }

    /**
     * Reads a text file line by line, terminating every line with "\n".
     * A file starting with a UTF-8 byte order mark is decoded as UTF-8 (the
     * mark itself is not part of the result); any other file is decoded with
     * the platform default charset.
     *
     * @param fileName path of the text file to read
     * @return the file content, or the part read before an I/O error occurred
     */
    public static String Txt2Text(String fileName) {

        StringBuffer buffer = new StringBuffer();
        BufferedReader reader = null;
        InputStream is = null;
        try {
            // Buffered so the bytes consumed by the BOM probe can be pushed back.
            is = new BufferedInputStream(new FileInputStream(fileName));
            is.mark(BOM_LENGTH);
            if (getEncode(is).equals("UTF-8")) {
                // The probe has already consumed the BOM; continue right after it.
                reader = new BufferedReader(new InputStreamReader(is, "utf-8"));
            } else {
                // No BOM: the probed bytes are real content and must be re-read.
                is.reset();
                reader = new BufferedReader(new InputStreamReader(is));
            }

            String line = reader.readLine();
            while (line != null) { // null signals end of file
                buffer.append(line);
                buffer.append("\n");
                line = reader.readLine();
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException.
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
            closeQuietly(is);
        }
        return buffer.toString();
    }

    /**
     * Reads up to three bytes from the stream and checks them against the
     * UTF-8 byte order mark (EF BB BF). The bytes read are consumed; callers
     * that need them again must mark and reset the stream themselves.
     * Detection of other encodings is not implemented.
     *
     * @param fileStream stream positioned at the start of the file
     * @return "UTF-8" if the stream starts with the UTF-8 byte order mark,
     *         otherwise "default" (also when reading fails)
     */
    private static String getEncode(InputStream fileStream) {

        try {
            int[] fileFormat = new int[BOM_LENGTH];
            for (int index = 0; index < BOM_LENGTH; index++) {
                int data = fileStream.read();
                if (data == -1) {
                    break; // shorter than a BOM; remaining slots stay 0
                }
                fileFormat[index] = data;
            }

            if (fileFormat[0] == 239 && fileFormat[1] == 187 && fileFormat[2] == 191) {
                return "UTF-8";
            }
            return "default";
        } catch (IOException e) {
            e.printStackTrace();
            return "default";
        }
    }

    /**
     * Converts a PDF file to text by running the external pdftotext executable
     * and collecting its standard output, decoded as UTF-8.
     *
     * <p>NOTE: depends on an executable at a hard-coded Windows path
     * ({@code D:\pdftotext.exe}).
     *
     * @param fileName path of the PDF file to convert
     * @return the complete text written by pdftotext, or an empty string if the
     *         process could not be started or its output could not be read
     */
    public String Pdf2Text(String fileName) {

        String ts = "";
        Process p = null;
        InputStreamReader reader = null;
        try {
            String[] cmd = new String[] {PATH_TO_XPDF, "-enc", "UTF-8", "-q", fileName, "-"};
            p = Runtime.getRuntime().exec(cmd);
            reader = new InputStreamReader(new BufferedInputStream(p.getInputStream()), "UTF-8");
            StringWriter out = new StringWriter();
            char[] buf = new char[10000];
            int len;
            while ((len = reader.read(buf)) >= 0) {
                // Accumulate only the chars actually read in this round.
                out.write(buf, 0, len);
            }
            // The result is everything collected, not just the last buffer.
            ts = out.toString();
        } catch (IOException e) {
            // Also covers UnsupportedEncodingException.
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
            if (p != null) {
                // Releases the child process and its remaining streams.
                p.destroy();
            }
        }
        return ts;
    }

    /** Closes the stream if it is non-null, reporting (not propagating) any failure. */
    private static void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Closes the reader if it is non-null, reporting (not propagating) any failure. */
    private static void closeQuietly(java.io.Reader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Demo entry point: prints the content of a sample text file.
     * Word2Text and Excel2Text can be tried the same way with a .doc or .xls path.
     */
    public static void main(String[] args) {

        System.out.println(Doc2Text.Txt2Text("D:\\wap经验总结1.txt"));
    }
}

注:上面的代码需要引入相关的 jar 包:poi-3.0-alpha3-20061212.jar 和 tm-extractors-0.4.jar。

 

另外:Word 文档中的图片会被直接丢弃,表格线等内容可能显示为乱码。欢迎各位大大批评指正,谢谢!

你可能感兴趣的:(apache,WAP)