使用 tm-extractors-0.4.jar 来读取 Word 文件

使用 tm-extractors-0.4.jar 来读取 Word 文件

package searchfileexample;

import javax.servlet.*;
import javax.servlet.http.*;
import java.io.*;
import java.util.*;
import org.textmining.text.extraction.WordExtractor;

public class ReadWord extends HttpServlet {
  private static final String CONTENT_TYPE = "text/html; charset=GBK";

  //Initialize global variables
  public void init() throws ServletException {
  }

  //Process the HTTP Get request
  public void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
    response.setContentType(CONTENT_TYPE);
    FileInputStream in = new FileInputStream ("D:/lfy_programe/全文检索/SearchFileExample/a/aa.doc");
       //  FileInputStream in = new FileInputStream ("D:/szqxjzhbase/技术测试/新建 Microsoft Word 文档.doc");
   WordExtractor extractor = new WordExtractor();
   System.out.println(in.available());
  String str = null;
  try {
    str = extractor.extractText(in);
  }
  catch (Exception ex) {
  }
//    System.out.println("the result length is"+str.length());
   System.out.println(str);

  }

  //Clean up resources
  public void destroy() {
  }
}

你可能感兴趣的:(使用tm-extractors-0.4.jar来读取word文件)