Java POI 组件——简单提取 Word 文本,以及将 Word 转换为 HTML、Text、XML(仅支持 doc,不支持 docx)

需要添加的库

  • poi-3.15.jar
  • poi-ooxml-3.15.jar
  • poi-scratchpad-3.15.jar
package com.poi.word;

import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.POITextExtractor;
import org.apache.poi.hpsf.ClassID;
import org.apache.poi.hpsf.CustomProperties;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.Property;
import org.apache.poi.hpsf.Section;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hpsf.Thumbnail;
import org.apache.poi.hpsf.wellknown.PropertyIDMap;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.AbstractWordConverter;
import org.apache.poi.hwpf.converter.WordToFoConverter;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.converter.WordToTextConverter;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.w3c.dom.Document;

/**
 * Demonstrates Apache POI's handling of legacy Word documents: dumping the
 * text and metadata of a file and converting it to HTML, plain text and XML.
 *
 * <p>POI's Word support is limited: the conversions used here (to HTML, text
 * and XML) work for {@code .doc} files only, not {@code .docx}.
 */
public class PoiWordClass {

    /** Encoding requested from the transformer for every converted file. */
    private static final String OUTPUT_ENCODING = "gbk";

    /**
     * Opens the Word document at {@code path} and prints to standard output
     * everything the {@link WordExtractor} exposes: body text, header/footer,
     * comments, end-/footnotes, text boxes, paragraphs and the summary
     * property sets.
     *
     * <p>I/O failures are reported with {@code printStackTrace()} and do not
     * propagate to the caller.
     *
     * @param path file-system path of the {@code .doc} file to read
     */
    private static void extract(String path) {
        InputStream is = null;
        try {
            is = new FileInputStream(path);
            WordExtractor extractor = new WordExtractor(is);

            System.out.println("\nextractor.getText()");
            System.out.println(extractor.getText());

            System.out.println("\nextractor.getTextFromPieces()");
            System.out.println(extractor.getTextFromPieces());

            System.out.println("\nextractor.getHeaderText()");
            System.out.println(extractor.getHeaderText());

            System.out.println("\nextractor.getFooterText()");
            System.out.println(extractor.getFooterText());

            System.out.println("\nextractor.getCommentsText()");
            printAll(extractor.getCommentsText());

            System.out.println("\nextractor.getEndnoteText()");
            printAll(extractor.getEndnoteText());

            System.out.println("\nextractor.getFootnoteText()");
            printAll(extractor.getFootnoteText());

            System.out.println("\nextractor.getMainTextboxText()");
            printAll(extractor.getMainTextboxText());

            System.out.println("\nextractor.getParagraphText()");
            printAll(extractor.getParagraphText());

            // println(Object) prints "null" for a missing property set instead
            // of failing with a NullPointerException on an explicit toString().
            System.out.println("\nextractor.getDocSummaryInformation().toString()");
            DocumentSummaryInformation docSummaryInformation = extractor.getDocSummaryInformation();
            System.out.println(docSummaryInformation);

            System.out.println("\nextractor.getMetadataTextExtractor().toString()");
            POITextExtractor metadataTextExtractor = extractor.getMetadataTextExtractor();
            System.out.println(metadataTextExtractor.getText());

            System.out.println("\nextractor.getSummaryInformation().toString()");
            SummaryInformation summaryInformation = extractor.getSummaryInformation();
            System.out.println(summaryInformation);

            if (docSummaryInformation != null) {
                print(docSummaryInformation);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeAndReport(is);
        }
    }

    /**
     * Prints each element of {@code lines} on its own line.
     *
     * @param lines strings to print; {@code null} is treated as empty
     */
    private static void printAll(String[] lines) {
        if (lines == null) {
            return;
        }
        for (String line : lines) {
            System.out.println(line);
        }
    }

    /**
     * Closes {@code resource} if it is non-null, reporting (not propagating)
     * any {@link IOException} raised while closing.
     *
     * @param resource stream to close, may be {@code null}
     */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads every accessor of a {@link DocumentSummaryInformation} into a
     * local variable. Nothing is printed; the method serves as a catalogue of
     * the available getters (convenient for inspection in a debugger).
     *
     * @param docSummaryInformation property set to read, must not be {@code null}
     */
    private static void print(DocumentSummaryInformation docSummaryInformation) {
        int applicationVersion = docSummaryInformation.getApplicationVersion();
        int byteCount = docSummaryInformation.getByteCount();
        int byteOrder = docSummaryInformation.getByteOrder();
        String category = docSummaryInformation.getCategory();
        int charCountWithSpaces = docSummaryInformation.getCharCountWithSpaces();
        Class<?> cls = docSummaryInformation.getClass();
        String company = docSummaryInformation.getCompany();
        String contentStatus = docSummaryInformation.getContentStatus();
        String contentType = docSummaryInformation.getContentType();
        CustomProperties customProperties = docSummaryInformation.getCustomProperties();
        // Not called: the original author observed
        // java.lang.UnsupportedOperationException: Reading byte arrays is not yet implemented.
        // byte[] docparts = docSummaryInformation.getDocparts();
        String documentVersion = docSummaryInformation.getDocumentVersion();
        Section section = docSummaryInformation.getFirstSection();
        int format = docSummaryInformation.getFormat();
        // Not called: the original author observed
        // java.lang.UnsupportedOperationException: Reading byte arrays is not yet implemented.
        // byte[] headingPair = docSummaryInformation.getHeadingPair();
        int hiddenCount = docSummaryInformation.getHiddenCount();
        boolean hyperlinksChanged = docSummaryInformation.getHyperlinksChanged();
        String language = docSummaryInformation.getLanguage();
        int lineCount = docSummaryInformation.getLineCount();
        boolean linksDirty = docSummaryInformation.getLinksDirty();
        String manager = docSummaryInformation.getManager();
        int mmClipCount = docSummaryInformation.getMMClipCount();
        int noteCount = docSummaryInformation.getNoteCount();
        int osVersion = docSummaryInformation.getOSVersion();
        int parCount = docSummaryInformation.getParCount();
        String presentationFormat = docSummaryInformation.getPresentationFormat();
        Property[] properties = docSummaryInformation.getProperties();
        PropertyIDMap propertyIDMap = docSummaryInformation.getPropertySetIDMap();
        boolean scale = docSummaryInformation.getScale();
        int sectionCount = docSummaryInformation.getSectionCount();
        // Held as List: casting to a concrete list class would tie this code
        // to whichever implementation the library happens to return.
        List<Section> sections = docSummaryInformation.getSections();
        // Not called: the original author observed
        // org.apache.poi.hpsf.NoSingleSectionException: Property set contains 2 sections.
        // Section singleSection = docSummaryInformation.getSingleSection();
        int slideCount = docSummaryInformation.getSlideCount();
        byte[] vbaDigitalSignature = docSummaryInformation.getVBADigitalSignature();
        // Original author's note: dumping the values by reflecting over
        // cls.getFields() does not work because the properties are kept in a
        // HashMap internally.
    }

    /**
     * Reads every accessor of a metadata {@link POITextExtractor} into a local
     * variable. Nothing is printed.
     *
     * @param metadataTextExtractor extractor to read, must not be {@code null}
     */
    private static void print(POITextExtractor metadataTextExtractor) {
        Class<?> cls = metadataTextExtractor.getClass();
        POITextExtractor poiTextExtractor = metadataTextExtractor.getMetadataTextExtractor();
        String text = metadataTextExtractor.getText();
    }

    /**
     * Reads every accessor of a {@link SummaryInformation} into a local
     * variable. Nothing is printed.
     *
     * @param summaryInformation property set to read, must not be {@code null}
     */
    private static void print(SummaryInformation summaryInformation) {
        String applicationName = summaryInformation.getApplicationName();
        String author = summaryInformation.getAuthor();
        int byteOrder = summaryInformation.getByteOrder();
        int charCount = summaryInformation.getCharCount();
        Class<?> cls = summaryInformation.getClass();
        ClassID classID = summaryInformation.getClassID();
        String comments = summaryInformation.getComments();
        Date createDateTime = summaryInformation.getCreateDateTime();
        long editTime = summaryInformation.getEditTime();
        Section section = summaryInformation.getFirstSection();
        int format = summaryInformation.getFormat();
        String keywords = summaryInformation.getKeywords();
        String lastAuthor = summaryInformation.getLastAuthor();
        Date lastPrinted = summaryInformation.getLastPrinted();
        Date lastSaveDateTime = summaryInformation.getLastSaveDateTime();
        int osVersion = summaryInformation.getOSVersion();
        int pageCount = summaryInformation.getPageCount();
        Property[] properties = summaryInformation.getProperties();
        PropertyIDMap propertySetIDMap = summaryInformation.getPropertySetIDMap();
        String revNumber = summaryInformation.getRevNumber();
        int sectionCount = summaryInformation.getSectionCount();
        // Held as List rather than cast to a concrete list class.
        List<Section> sections = summaryInformation.getSections();
        int security = summaryInformation.getSecurity();
        Section singleSection = summaryInformation.getSingleSection();
        String subject = summaryInformation.getSubject();
        String template = summaryInformation.getTemplate();
        byte[] thumbnail = summaryInformation.getThumbnail();
        Thumbnail thumbnailThumbnail = summaryInformation.getThumbnailThumbnail();
        String title = summaryInformation.getTitle();
        int wordCount = summaryInformation.getWordCount();
    }

    /** Output formats supported by {@code convert}. */
    enum ConverterType {
        HTML, TEXT, XML
    }

    /**
     * Converts the Word document at {@code srcPath} to the requested format
     * and writes it next to the given destination base name, appending
     * {@code .html}, {@code .txt} or {@code .xml} according to {@code type}.
     * {@code XML} output is produced with {@link WordToFoConverter}.
     *
     * <p>The result is streamed through an {@link OutputStream} so that the
     * transformer itself encodes the bytes with {@code OUTPUT_ENCODING}; going
     * through a platform-default {@code FileWriter} could leave the declared
     * encoding and the actual bytes out of sync.
     *
     * <p>I/O, parser and transformer failures are reported with
     * {@code printStackTrace()} and do not propagate to the caller.
     *
     * @param srcPath                  path of the {@code .doc} file to convert
     * @param destPathWithoutExtension output path without file extension
     * @param type                     target format, must not be {@code null}
     * @throws IllegalArgumentException if {@code type} is {@code null} or not
     *                                  one of the supported formats
     */
    private static void convert(String srcPath, String destPathWithoutExtension, ConverterType type) {
        if (type == null) {
            throw new IllegalArgumentException("type must not be null");
        }

        InputStream is = null;
        OutputStream os = null;
        try {
            is = new FileInputStream(srcPath);
            HWPFDocument hwpfDocument = new HWPFDocument(is);
            Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();

            AbstractWordConverter converter;
            String method;
            String extension;
            switch (type) {
            case HTML:
                converter = new WordToHtmlConverter(document);
                method = "html";
                extension = ".html";
                break;
            case TEXT:
                converter = new WordToTextConverter(document);
                method = "text";
                extension = ".txt";
                break;
            case XML:
                converter = new WordToFoConverter(document);
                method = "xml";
                extension = ".xml";
                break;
            default:
                throw new IllegalArgumentException("Unsupported converter type: " + type);
            }
            converter.processDocument(hwpfDocument);

            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            transformer.setOutputProperty(OutputKeys.ENCODING, OUTPUT_ENCODING);
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            transformer.setOutputProperty(OutputKeys.METHOD, method);

            // Open the target only once everything else is ready, so that an
            // earlier failure does not leave an empty output file behind.
            os = new FileOutputStream(destPathWithoutExtension + extension);
            DOMSource domSource = new DOMSource(converter.getDocument());
            StreamResult streamResult = new StreamResult(os);
            transformer.transform(domSource, streamResult);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParserConfigurationException e) {
            e.printStackTrace();
        } catch (TransformerConfigurationException e) {
            e.printStackTrace();
        } catch (TransformerFactoryConfigurationError e) {
            e.printStackTrace();
        } catch (TransformerException e) {
            e.printStackTrace();
        } finally {
            closeAndReport(is);
            closeAndReport(os);
        }
    }

    /**
     * Converts {@code srcPath} to HTML, writing {@code dstPath + ".html"}.
     *
     * @param srcPath path of the {@code .doc} file to convert
     * @param dstPath output path without file extension
     */
    private static void convertToHtml(String srcPath, String dstPath) {
        convert(srcPath, dstPath, ConverterType.HTML);
    }

    /**
     * Converts {@code srcPath} to plain text, writing {@code dstPath + ".txt"}.
     *
     * @param srcPath path of the {@code .doc} file to convert
     * @param dstPath output path without file extension
     */
    private static void convertToText(String srcPath, String dstPath) {
        convert(srcPath, dstPath, ConverterType.TEXT);
    }

    /**
     * Converts {@code srcPath} to XML, writing {@code dstPath + ".xml"}.
     *
     * @param srcPath path of the {@code .doc} file to convert
     * @param dstPath output path without file extension
     */
    private static void convertToXml(String srcPath, String dstPath) {
        convert(srcPath, dstPath, ConverterType.XML);
    }

    /**
     * Runs the demo against {@code test.doc} in the working directory: dumps
     * its contents, then writes {@code test.html}, {@code test.txt} and
     * {@code test.xml}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String path = "test.doc";
        extract(path);
        convertToHtml(path, "test");
        convertToText(path, "test");
        convertToXml(path, "test");
    }
}

你可能感兴趣的:(——,Java,POI,操作,Office)