以下是用Java读取几种文本文件内容的代码。其中,Office文档(Word、Excel)使用了POI库,PDF使用了PDFBox库。
点击这里 查看相关控件的下载地址和配置方法。
WORD
package textReader;

import java.io.*;
import org.apache.poi.hwpf.extractor.WordExtractor;

public class WordReader {
    public WordReader() {
    }

    /**
     * Extracts the text of a Word document through POI's WordExtractor.
     *
     * @param filePath path of the Word file
     * @return the text of the document, or null when the file cannot be
     *         opened or read (the exception is printed, not rethrown)
     */
    public String getTextFromWord(String filePath) {
        String result = null;
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(new File(filePath));
            WordExtractor wordExtractor = new WordExtractor(fis);
            result = wordExtractor.getText();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the file handle on every path; the stream was
            // previously left open.
            if (fis != null) {
                try {
                    fis.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return result;
    }
}
EXCEL
package textReader;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFCell;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
public class ExcelReader {
    /**
     * Extracts the numeric and string cells of every sheet in an Excel
     * workbook. Each cell value is followed by a tab and each non-empty
     * row by a newline. Formula cells and all other cell types are skipped.
     *
     * @param filePath path of the Excel file
     * @return the text collected so far; empty when the file cannot be
     *         opened or read (the exception is printed, not rethrown)
     */
    @SuppressWarnings("deprecation")
    public String getTextFromExcel(String filePath) {
        StringBuilder buff = new StringBuilder();
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(filePath);
            // Open the workbook.
            HSSFWorkbook wb = new HSSFWorkbook(fis);
            // Walk every sheet of the workbook.
            for (int numSheets = 0; numSheets < wb.getNumberOfSheets(); numSheets++) {
                HSSFSheet aSheet = wb.getSheetAt(numSheets);
                if (aSheet == null) {
                    continue;
                }
                for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) {
                    HSSFRow aRow = aSheet.getRow(rowNumOfSheet);
                    if (aRow == null) {
                        continue;
                    }
                    // The inclusive upper bound is safe: indexes without a
                    // cell are skipped by the null check below.
                    for (int cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++) {
                        HSSFCell aCell = aRow.getCell(cellNumOfRow);
                        if (aCell == null) {
                            continue;
                        }
                        switch (aCell.getCellType()) {
                            case HSSFCell.CELL_TYPE_FORMULA:
                                break;
                            case HSSFCell.CELL_TYPE_NUMERIC:
                                buff.append(aCell.getNumericCellValue()).append('\t');
                                break;
                            case HSSFCell.CELL_TYPE_STRING:
                                buff.append(aCell.getStringCellValue()).append('\t');
                                break;
                        }
                    }
                    buff.append('\n');
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the file handle on every path; the stream was
            // previously left open.
            if (fis != null) {
                try {
                    fis.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return buff.toString();
    }
}
package textReader;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

public class PdfReader {
    public PdfReader() {
    }

    /**
     * Parses a PDF file and returns the text produced by PDFTextStripper.
     *
     * @param filePath path of the PDF file
     * @return the text of the PDF, or null when the file cannot be opened
     *         or parsed (the exception is printed, not rethrown)
     */
    public String getTextFromPdf(String filePath) {
        String text = null;
        FileInputStream input = null;
        PDDocument pdf = null;
        try {
            input = new FileInputStream(filePath);
            PDFParser pdfParser = new PDFParser(input);
            pdfParser.parse();
            pdf = pdfParser.getPDDocument();
            text = new PDFTextStripper().getText(pdf);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Same order as before: the raw stream first, then the document.
            closeStream(input);
            closeDocument(pdf);
        }
        return text;
    }

    /** Closes the stream if it was opened, printing any close failure. */
    private static void closeStream(FileInputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Closes the document if it was created, printing any close failure. */
    private static void closeDocument(PDDocument doc) {
        if (doc == null) {
            return;
        }
        try {
            doc.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
TXT
package textReader;

import java.io.*;

public class TxtReader {
    public TxtReader() {
    }

    /**
     * Reads a text file line by line using the platform default charset
     * (FileReader) and re-joins the lines, terminating each one with "\r\n".
     *
     * @param filePath path of the text file
     * @return the content of the file with every line ending in "\r\n";
     *         an empty string for an empty file
     * @throws Exception if the file cannot be opened or read
     */
    public String getTextFromTxt(String filePath) throws Exception {
        BufferedReader br = new BufferedReader(new FileReader(filePath));
        try {
            StringBuilder buff = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                buff.append(line).append("\r\n");
            }
            return buff.toString();
        } finally {
            // Close the reader even when readLine fails; previously the
            // file handle leaked on that path.
            br.close();
        }
    }
}
RTF
package textReader;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;

public class RtfReader {
    public RtfReader() {
    }

    /**
     * Loads an RTF file into a styled document with RTFEditorKit and
     * returns the document's plain text.
     *
     * @param filePath path of the RTF file
     * @return the text of the RTF file, or null when the file cannot be
     *         opened or read (the exception is printed, not rethrown)
     */
    public String getTextFromRtf(String filePath) {
        String result = null;
        InputStream is = null;
        try {
            DefaultStyledDocument styledDoc = new DefaultStyledDocument();
            is = new FileInputStream(new File(filePath));
            new RTFEditorKit().read(is, styledDoc, 0);
            // Extract the text. To read Chinese content the text has to be
            // turned back into ISO8859_1 bytes, otherwise it comes out
            // garbled. The bytes are then decoded with the platform default
            // charset, so the result depends on the machine's locale.
            result = new String(styledDoc.getText(0, styledDoc.getLength()).getBytes("ISO8859_1"));
        } catch (IOException e) {
            e.printStackTrace();
        } catch (BadLocationException e) {
            e.printStackTrace();
        } finally {
            // Release the file handle on every path; the stream was
            // previously left open.
            if (is != null) {
                try {
                    is.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return result;
    }
}
HTML
package textReader; import java.io.*; public class HtmlReader { public HtmlReader() { } /** * @param filePath 文件路径 * @return 获得html的全部内容 */ public String readHtml(String filePath) { BufferedReader br=null; StringBuffer sb = new StringBuffer(); try { br=new BufferedReader(new InputStreamReader( new FileInputStream(filePath), "GB2312")); String temp=null; while((temp=br.readLine())!=null){ sb.append(temp); } } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return sb.toString(); } /** * @param filePath 文件路径 * @return 获得的html文本内容 */ public String getTextFromHtml(String filePath) { //得到body标签中的内容 String str= readHtml(filePath); StringBuffer buff = new StringBuffer(); int maxindex = str.length() - 1; int begin = 0; int end; //截取>和<之间的内容 while((begin = str.indexOf('>',begin)) < maxindex){ end = str.indexOf('<',begin); if(end - begin > 1){ buff.append(str.substring(++begin, end)); } begin = end+1; }; return buff.toString(); } }注意 :若使用WPS编辑相关文档,会有错误提示,应避免。 错误文本提示如下:
WORD
EXCEL
RTF
顺便说一下,这里为什么会在write处出错呢?因为level是根据{和}来进行自增和自减的,当括号不匹配时就会提示该错误。WPS编辑的RTF文件在格式上有问题,{和}不匹配。而用Word或写字板新建文件,编辑后另存为RTF文件(WPS下不支持),再用记事本打开,可以发现其中添加了很多格式说明,但{和}是匹配的,这样才不会报错。下面是具体说明:
http://www.chinaitpower.com/source/jdk122/javax/swing/text/rtf/RTFParser.java.html
(完)
create@2009-08-17