使用Java基于数据流直接抽取Word文本

[代码] [Java]代码

view source
print ?
001 public class WordExtractor {
002   
003     public static StringBuilder logBytes = new StringBuilder();
004   
005     public static String bytesToString(byte[] ogiBytes, int start, int length, int fc)
006     {
007         StringBuilder content = new StringBuilder();
008         byte[] bytes = new byte[length];
009         System.arraycopy(ogiBytes, start, bytes, 0, length);
010         if(fc == 0)
011         {
012             for(int i=0;i<bytes.length;i++)
013             {
014                 if(i == bytes.length - 1)
015                 {
016                     return content.toString();
017                 }
018               
019                 String a = Integer.toHexString(bytes[i+1] & 0xFF);
020                 String b = Integer.toHexString(bytes[i] & 0xFF);
021                 if(a.length() == 1)
022                 {
023                     a = "0"+ a;
024                 }
025               
026                 if(b.length() == 1)
027                 {
028                     b = "0"+ b;
029                 }
030               
031                 String hexStr = a + b;
032                 int ch = Integer.valueOf(hexStr, 16);
033                 content.append( (char)ch );
034                 i++;
035             }
036         }
037         else
038         {
039             for(int i=0;i<bytes.length;i++)
040             {
041                 int ch = bytes[i] & 0xFF;
042                 content.append( (char)ch );
043             }
044         }
045       
046         return content.toString();
047     }
048   
049     public static void bytesToString(byte[] ogiBytes, StringBuilder content, int start, int length, int fc)
050     {
051         content.append( bytesToString(ogiBytes, start, length, fc) );
052     }
053   
054     public static void printLogBytes(List<Byte> legaled) throws Exception
055     {
056         logBytes = new StringBuilder();
057       
058         logBytes.append(" ========================================================");
059         for(int a=0;a<legaled.size();a++)
060         {
061             if(a % 16 == 0)
062             {
063                 logBytes.append(" ");
064             }
065             logBytes.append(Integer.toHexString(legaled.get(a) & 0xFF) +" ");
066         }
067         logBytes.append(" ========================================================");
068       
069         FileUtil.writeAscFile("E:ytes.txt", logBytes.toString());
070     }
071   
072     public static int getOneTable(byte[] ogiBytes, Stream stream, int dirSect1)
073     {
074         for(int i=0;i<8;i++)
075         {
076             int offsetEntry = (dirSect1 + 1)*512 + i*128;
077             StringBuilder content = new StringBuilder();
078             bytesToString(ogiBytes, content, offsetEntry, 64, 0);
079             if(content.toString().indexOf("1Table") > -1)
080             {
081                 return offsetEntry;
082             }
083         }
084       
085         return 0;
086     }
087   
088     public static void main(String[] args) throws Exception
089     {
090         byte[] ogiBytes = FileUtil.readBinFile("D: oolsoletest est-old.doc");
091       
092         System.out.println("Total bytes: "+ ogiBytes.length);
093         if(
094                 ogiBytes.length < 8         ||
095                 (ogiBytes[0] & 0xFF) != 208 ||
096                 (ogiBytes[1] & 0xFF) != 207 ||
097                 (ogiBytes[2] & 0xFF) != 17     ||
098                 (ogiBytes[3] & 0xFF) != 224 ||
099                 (ogiBytes[4] & 0xFF) != 161 ||
100                 (ogiBytes[5] & 0xFF) != 177 ||
101                 (ogiBytes[6] & 0xFF) != 26     ||
102                 (ogiBytes[7] & 0xFF) != 225
103         ){
104             System.out.println("Not the doc file!");
105             return;
106         }
107       
108         StringBuilder content = new StringBuilder();
109       
110         Stream stream = new Stream(ogiBytes);
111         int[] offset = new int[1];
112       
113         offset[0] = 48;
114         int dirSect1 = stream.getInteger(offset);
115         int oneTable = getOneTable(ogiBytes, stream, dirSect1);
116       
117         offset[0] = oneTable + 116;
118         int startSect = stream.getInteger(offset);
119         int tableStream = (startSect + 1)*512;
120       
121         offset[0] = 930;
122         int fcClx = stream.getInteger(offset);
123         if(fcClx == -1)
124         {
125             System.out.println("This version of doc can not be parsed!");
126             return;
127         }
128       
129         int offsetClx = tableStream + fcClx;
130       
131         offset[0] = offsetClx + 1;
132         int lcb = stream.getInteger(offset);
133      
134         int countPcd = (lcb - 4)/12;
135         int countCp = (lcb - countPcd*8)/4;
136         int offsetPlcpcd = offsetClx + 5;
137      
138         for(int i=0;i<countPcd;i++)
139         {
140             int offsetPcd = offsetPlcpcd + countCp*4 + i*8;
141           
142             offset[0] = offsetPcd + 2;
143             int start = stream.getInteger(offset);
144             int fc = start >> 30;
145             start = (start << 2) >> 2;
146      
147             offset[0] = offsetPlcpcd + i*4;
148             int cpPre = stream.getInteger(offset);
149             int cpNext = stream.getInteger(offset);
150             int length = cpNext - cpPre -1;
151             if(fc == 0)
152             {
153                 length *= 2;
154             }
155             else
156             {
157                 start = start/2;
158             }
159           
160             start += 512;
161             bytesToString(ogiBytes, content, start, length, fc);
162       
163             System.out.println(start +", "+ length);
164         }
165       
166         FileUtil.writeAscFile("E:output.txt", content.toString(), false);
167       
168         System.out.println("Done!");
169       
170     }
171 }

[代码] FileUtil

view source
print ?
01 import java.io.FileInputStream;
02 import java.io.FileOutputStream;
03 import java.io.InputStreamReader;
04 import java.io.OutputStreamWriter;
05  
/**
 * Small file helpers: whole-file binary and UTF-8 text reads and writes.
 * Every method closes its stream even when an I/O call fails.
 */
public class FileUtil {

    /**
     * Reads the complete contents of a file.
     *
     * <p>{@code available()} is only used as an initial size hint; the method
     * keeps reading until end of stream, so the result is complete even when
     * a single {@code read} call returns fewer bytes than requested or the
     * hint is too small.
     *
     * @param path file to read
     * @return all bytes of the file (empty array for an empty file)
     * @throws Exception if the file cannot be opened or read
     */
    public static byte[] readBinFile(String path) throws Exception
    {
        FileInputStream stream = new FileInputStream(path);
        try
        {
            byte[] buffer = new byte[stream.available()];
            int filled = 0;

            while(true)
            {
                if(filled == buffer.length)
                {
                    // Buffer is full: probe one byte to tell "end of file"
                    // apart from "size hint was too small" before growing.
                    int probe = stream.read();
                    if(probe == -1)
                    {
                        break;
                    }
                    byte[] grown = new byte[Math.max(buffer.length * 2, 4096)];
                    System.arraycopy(buffer, 0, grown, 0, filled);
                    buffer = grown;
                    buffer[filled++] = (byte)probe;
                    continue;
                }

                int count = stream.read(buffer, filled, buffer.length - filled);
                if(count == -1)
                {
                    break;
                }
                filled += count;
            }

            if(filled == buffer.length)
            {
                return buffer;
            }
            byte[] exact = new byte[filled];
            System.arraycopy(buffer, 0, exact, 0, filled);
            return exact;
        }
        finally
        {
            stream.close();
        }
    }

    /**
     * Reads a whole file as UTF-8 text.
     *
     * @param path file to read
     * @return the decoded text (empty string for an empty file)
     * @throws Exception if the file cannot be opened or read
     */
    public static String readAscFile(String path) throws Exception
    {
        InputStreamReader reader = new InputStreamReader(new FileInputStream(path), "UTF-8");
        try
        {
            StringBuilder sb = new StringBuilder();
            char[] chunk = new char[4096];

            int count;
            while((count = reader.read(chunk)) != -1)
            {
                sb.append(chunk, 0, count);
            }

            return sb.toString();
        }
        finally
        {
            reader.close();
        }
    }

    /**
     * Appends {@code buffer} to the end of a file, creating the file if needed.
     *
     * @param path   file to append to
     * @param buffer bytes to write
     * @throws Exception if the file cannot be opened or written
     */
    public synchronized static void writeBinFile(String path, byte[] buffer) throws Exception
    {
        FileOutputStream output = new FileOutputStream(path, true);
        try
        {
            output.write(buffer);
            output.flush();
        }
        finally
        {
            output.close();
        }
    }

    /**
     * Appends {@code content} to a file as UTF-8 text.
     *
     * @param path    file to append to
     * @param content text to write
     * @throws Exception if the file cannot be opened or written
     */
    public synchronized static void writeAscFile(String path, String content) throws Exception
    {
        writeAscFile(path, content, true);
    }

    /**
     * Writes {@code content} to a file as UTF-8 text.
     *
     * @param path    file to write
     * @param content text to write
     * @param append  {@code true} to append to existing contents,
     *                {@code false} to replace them
     * @throws Exception if the file cannot be opened or written
     */
    public synchronized static void writeAscFile(String path, String content, boolean append) throws Exception
    {
        OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(path, append), "UTF-8");
        try
        {
            writer.append(content);
            writer.flush();
        }
        finally
        {
            writer.close();
        }
    }
}

你可能感兴趣的:(使用Java基于数据流直接抽取Word文本)