使用 HtmlParser 解析 HTML 文件

自己整理的 HtmlParser 用法示例,分享给大家。

Java代码

  1. package epson;  
  2. import java.io.BufferedReader;  
  3. import java.io.File;  
  4. import java.io.FileNotFoundException;  
  5. import java.io.FileReader;  
  6. import java.io.IOException;  
  7.  
  8. import org.htmlparser.Node;  
  9. import org.htmlparser.NodeFilter;  
  10. import org.htmlparser.Parser;  
  11. import org.htmlparser.Tag;  
  12. import org.htmlparser.filters.AndFilter;  
  13. import org.htmlparser.filters.HasAttributeFilter;  
  14. import org.htmlparser.filters.NodeClassFilter;  
  15. import org.htmlparser.filters.OrFilter;  
  16. import org.htmlparser.filters.TagNameFilter;  
  17. import org.htmlparser.tags.BodyTag;  
  18. import org.htmlparser.tags.HeadTag;  
  19. import org.htmlparser.tags.ImageTag;  
  20. import org.htmlparser.tags.MetaTag;  
  21. import org.htmlparser.tags.TableColumn;  
  22. import org.htmlparser.tags.TableRow;  
  23. import org.htmlparser.tags.TableTag;  
  24. import org.htmlparser.tags.TitleTag;  
  25. import org.htmlparser.util.NodeList;  
  26. import org.htmlparser.util.ParserException;  
  27. import org.htmlparser.util.SimpleNodeIterator;  
  28. import org.htmlparser.visitors.TextExtractingVisitor;  
  29.  
  30.  
  31. public class HtmlAnalysis {  
  32. /**
  33.      * Page properties held by this object: meta data, title, charset, content type, content and link.
  34.      */
  35. private String metaDataString;  
  36. private String title;  
  37. private String charset;  
  38. private String contentType;  
  39. private String content;  
  40. private String link;  
  41.  
  42.  
  43. private String localPath ;  
  44. private Parser parser = null;  
  45. private String htmlsource=null;  
  46.  
  47. public static final String META_KEYWORDS="keywords";  
  48. public static final String META_AUTHOR="author";  
  49. public static final String META_DESCRIPTION="description";  
  50. public static final String META_HTTP_EQUIV="http-equiv";  
  51.  
  /**
   * Creates an analyzer for the given resource string.
   * <p>
   * The string is only stored here; it is handed to the parser when {@code init()} is called.
   *
   * @param htmlsource the resource string later passed to the parser
   */
  public HtmlAnalysis(String htmlsource) {
      this.htmlsource = htmlsource;
  }
  55.  
  /**
   * Creates an analyzer for the contents of a local HTML file.
   * <p>
   * The file is read immediately through {@code getContentByLocalFile}. If reading fails, the
   * failure is printed to standard error and the HTML source stays {@code null}; the constructor
   * itself still completes, as it always did.
   *
   * @param htmlsource the file whose text becomes the HTML source
   */
  public HtmlAnalysis(File htmlsource) {
      try {
          this.htmlsource = this.getContentByLocalFile(htmlsource);
      } catch (Exception e) {
          // This used to be an empty catch, which hid unreadable or missing files entirely.
          // Report the failure the same way the rest of the class does.
          e.printStackTrace();
      }
  }
  65.  
  66. public void init() throws Exception{  
  67. try{  
  68.         parser = new Parser(this.htmlsource);  
  69.         }catch(Exception e){  
  70. throw e;  
  71.         }  
  72.     }  
  73.  
  74.  
  75. public String getMetaKeywords(){  
  76.         String metaKeywords = "";  
  77.  
  78. try {  
  79.             NodeFilter nt = new NodeClassFilter(MetaTag.class) ;  
  80.             NodeList nodeList = parser.parse(nt);  
  81. for (int i = 0 ; i< nodeList.size(); i++) {  
  82.                 MetaTag mt =(MetaTag) nodeList.elementAt(i) ;  
  83.                 String cont  = mt.getAttribute("name") ;  
  84.  
  85. if (cont!=null && cont.equalsIgnoreCase("Keywords")) {  
  86.                     metaKeywords = mt.getAttribute("content");  
  87. break;  
  88.                 }  
  89.             }  
  90.         } catch (ParserException e) {  
  91.             e.printStackTrace();  
  92.         }  
  93. return metaKeywords;  
  94.     }  
  95.  
  96. public String getTitle() {  
  97.         String title="";  
  98.  
  99. try {  
  100.             NodeFilter nt = new NodeClassFilter(TitleTag.class) ;  
  101.             NodeList nodeList = parser.parse(nt);  
  102. for (int i = 0 ; i&lt; nodeList.size(); i++) {  
  103.                 TitleTag titlenode = (TitleTag) nodeList.elementAt(i) ;  
  104.                 title = titlenode.getTitle();  
  105. break;  
  106.             }      
  107.         } catch (ParserException e) {  
  108.             e.printStackTrace();  
  109.         }  
  110.  
  111. return title;  
  112.     }  
  113.  
  114. public String getBody() {  
  115.         String body="";  
  116.  
  117. try {  
  118.         NodeFilter nt = new NodeClassFilter(BodyTag.class) ;  
  119.         NodeList nodeList = parser.parse(nt);  
  120. for (int i = 0 ; i&lt; nodeList.size(); i++) {  
  121.                    BodyTag bodynode = (BodyTag) nodeList.elementAt(i) ;  
  122.                    body = bodynode.getChildrenHTML();  
  123. break;  
  124.         }      
  125.         } catch (ParserException e) {  
  126.             e.printStackTrace();  
  127.         }  
  128.  
  129. return body;  
  130.     }  
  131.  
  132. public String getBodyOnload() {  
  133.         String bodyonload="";     
  134. try {  
  135.             NodeFilter nt = new NodeClassFilter(BodyTag.class) ;  
  136.             NodeList nodeList = parser.parse(nt);  
  137. for (int i = 0 ; i&lt; nodeList.size(); i++) {  
  138.                 BodyTag bodynode = (BodyTag) nodeList.elementAt(i) ;  
  139.                 bodyonload = bodynode.getAttribute("onload");  
  140. break;  
  141.             }    
  142.  
  143.         } catch (ParserException e) {  
  144.             e.printStackTrace();  
  145.         }  
  146.  
  147. return bodyonload;  
  148.     }      
  149.  
  150. public String getHeadInfo() {  
  151.         String head="";  
  152.  
  153. try {  
  154.             NodeFilter nt = new NodeClassFilter(HeadTag.class) ;  
  155.             NodeList nodeList = parser.parse(nt);  
  156.  
  157.             HeadTag headnode = null;  
  158. for (int i = 0 ; i&lt; nodeList.size(); i++) {  
  159.                 headnode = (HeadTag) nodeList.elementAt(i) ;  
  160. break;  
  161.             }    
  162.  
  163.  
  164. if(headnode !=null){  
  165.                 SimpleNodeIterator tag = headnode.children();  
  166. int i=0;  
  167. while(tag.hasMoreNodes()){  
  168.                     Node node =tag.nextNode();  
  169. if((node instanceof MetaTag) || node instanceof TitleTag){  
  170.                         headnode.removeChild(i);  
  171.                     }  
  172.  
  173.                     i++;  
  174.                 }  
  175.             }  
  176.  
  177.             head = headnode.getChildrenHTML();  
  178.  
  179.  
  180.         } catch (ParserException e) {  
  181.             e.printStackTrace();  
  182.         }  
  183.  
  184. return head;  
  185.     }   
  186.  
  187.  
  188. public String getMetaInfo(String keytype){  
  189.         String metaInfo = "";  
  190.  
  191. try {  
  192.  
  193.             NodeFilter nt = new NodeClassFilter(MetaTag.class) ;  
  194.             NodeList nodeList = parser.parse(nt);  
  195.  
  196. if(META_KEYWORDS.equalsIgnoreCase(keytype)  
  197.                 ||  
  198.                 META_AUTHOR.equalsIgnoreCase(keytype)  
  199.                 ||  
  200.                 META_DESCRIPTION.equalsIgnoreCase(keytype))  
  201.             {  
  202.  
  203. for (int i = 0 ; i&lt; nodeList.size(); i++) {  
  204.                     MetaTag mt =(MetaTag) nodeList.elementAt(i) ;  
  205.                     String cont  = mt.getAttribute("name") ;  
  206.  
  207. if (cont!=null && cont.equalsIgnoreCase(keytype)) {  
  208.                         metaInfo = mt.getAttribute("content");  
  209. break;  
  210.                     }  
  211.                 }  
  212.             }else if(META_HTTP_EQUIV.equals(keytype)){  
  213. for (int i = 0 ; i&lt; nodeList.size(); i++) {  
  214.                     MetaTag mt =(MetaTag) nodeList.elementAt(i) ;  
  215.                     String cont  = mt.getAttribute("http-equiv") ;  
  216.  
  217. if (cont!=null && cont.equalsIgnoreCase(keytype)) {  
  218.                         metaInfo = mt.getAttribute("content");  
  219. break;  
  220.                     }  
  221.                 }  
  222.             }else{  
  223. for (int i = 0 ; i&lt; nodeList.size(); i++) {  
  224.                     MetaTag mt =(MetaTag) nodeList.elementAt(i) ;  
  225.                     String cont  = mt.getAttribute("name") ;  
  226.  
  227. if (cont!=null) {  
  228.  
  229. if(META_KEYWORDS.equalsIgnoreCase(cont)  
  230.                                 ||  
  231.                                 META_AUTHOR.equalsIgnoreCase(cont)  
  232.                                 ||  
  233.                                 META_DESCRIPTION.equalsIgnoreCase(cont)){  
  234.  
  235. //
  236.                         }else{  
  237.                             String tempmetaInfo = mt.getAttribute("content");  
  238.                             metaInfo +="&lt;"+cont+">"+tempmetaInfo+"</"+cont+">";  
  239.                         }  
  240.  
  241.  
  242.                     }  
  243.                 }  
  244.  
  245.             }  
  246.  
  247.  
  248.         } catch (ParserException e) {  
  249.             e.printStackTrace();  
  250.         }  
  251. return metaInfo;  
  252.     }  
  253.  
  254.  
  255. public String  getContentByLocalFile (File path) throws IOException {  
  256.         StringBuffer sbStr = new StringBuffer();  
  257.         BufferedReader reader = null ;  
  258.         String result = null ;  
  259. try {  
  260.             reader = new BufferedReader(new FileReader(path));  
  261.         } catch (FileNotFoundException e) {  
  262.             e.printStackTrace();  
  263.         }  
  264.         String temp = "";  
  265. while((temp=reader.readLine())!=null)  
  266.           {  
  267.            sbStr.append(temp);  
  268.            sbStr.append("\r\n");  
  269.           }  
  270.           reader.close();  
  271.           result = sbStr.toString();  
  272. return result ;  
  273.     }  
  274.  
  275.  
  276. public String getContentByUrl(String url){  
  277. return null ;  
  278.     }  
  279.  
  280. public void getmetaDataByVistor() {  
  281.     }  
  282.  
  283. public String getURLContent(String Url) {  
  284.         Parser parser = null;  
  285.  
  286. try {  
  287.             parser = new Parser(Url);  
  288.             String a="";  
  289.             parser = new Parser(a);  
  290.             TextExtractingVisitor visitor = new TextExtractingVisitor();  
  291.             parser.visitAllNodesWith(visitor);  
  292.             content = visitor.getExtractedText();  
  293.         } catch (ParserException e1) {  
  294.             e1.printStackTrace();  
  295.         }  
  296.  
  297. return content;  
  298.     }  
  299. public NodeList getDiv(){  
  300.           NodeList nodelist=null;  
  301.           NodeFilter[] nodeFilter=new NodeFilter[2];  
  302. try{  
  303.            parser.setEncoding("GB2312");//set encode
  304.            TagNameFilter divFilter=new TagNameFilter("div");//get the table content
  305.            HasAttributeFilter divAttribute=new HasAttributeFilter("id","Cont_13");//hava the attribute "bgcolor"
  306.            nodeFilter[0]=divFilter;  
  307.            nodeFilter[1]=divAttribute;  
  308.            AndFilter andFilter=new AndFilter(nodeFilter);//to link the three filter that above together
  309.            nodelist=parser.extractAllNodesThatMatch(andFilter);//get the result that fit for the filter
  310.           }catch(Exception e){  
  311.            e.printStackTrace();  
  312.           }  
  313. return nodelist;  
  314.     }  
  315. public NodeList getTable() throws ParserException{  
  316.         NodeList nodelist=null;  
  317.         String dd = getDiv().toHtml();  
  318.         Parser parser2 = new Parser(dd);  
  319.         TagNameFilter tableFilter=new TagNameFilter("table");  
  320.         nodelist = parser2.extractAllNodesThatMatch(tableFilter);  
  321.         String htmlresult ="";  
  322. for (int i = 0; i <= nodelist.size(); i++) {  
  323. if (nodelist.elementAt(i) instanceof TableTag) {  
  324.                 TableTag tag = (TableTag) nodelist.elementAt(i);  
  325.                 TableRow[] rows = tag.getRows();  
  326.  
  327. for (int j = 0; j &lt; rows.length; j++) {  
  328.                     TableRow tr = (TableRow) rows[j];  
  329.                     TableColumn[] td = tr.getColumns();  
  330. for (int k = 0; k &lt; td.length; k++) {  
  331.                         String result = td[k].toPlainTextString().trim().replace("\t", "");  
  332. if(k==0){  
  333.                             htmlresult += "&lt;title>"+result+"</title>";  
  334.                         }  
  335. else
  336.                             htmlresult += "<id>"+result+"</id>";  
  337.                     }  
  338.                 }  
  339.             }  
  340.         }  
  341.         System.out.println(htmlresult);  
  342. return nodelist;  
  343.     }  
  344. public void testTable() {  
  345. //        Parser myParser;
  346.         NodeList nodeList = null;  
  347. //        myParser = Parser.createParser("<body> " + "<table id=’table1′ >"
  348. //                + "<tr><td>1-11</td><td>1-12</td><td>1-13</td>"
  349. //                + "<tr><td>1-21</td><td>1-22</td><td>1-23</td>"
  350. //                + "<tr><td>1-31</td><td>1-32</td><td>1-33</td></table>"
  351. //                + "<table id=’table2′ >"
  352. //                + "<tr><td>2-11</td><td>2-12</td><td>2-13</td>"
  353. //                + "<tr><td>2-21</td><td>2-22</td><td>2-23</td>"
  354. //                + "<tr><td>2-31</td><td>2-32</td><td>2-33</td></table>"
  355. //                + "</body>", "GBK");
  356.         NodeFilter tableFilter = new NodeClassFilter(TableTag.class);  
  357.         OrFilter lastFilter = new OrFilter();  
  358.         lastFilter.setPredicates(new NodeFilter[] { tableFilter });  
  359. try {  
  360.             nodeList = parser.parse(lastFilter);  
  361. for (int i = 0; i <= nodeList.size(); i++) {  
  362. if (nodeList.elementAt(i) instanceof TableTag) {  
  363.                     TableTag tag = (TableTag) nodeList.elementAt(i);  
  364.                     TableRow[] rows = tag.getRows();  
  365.  
  366. for (int j = 0; j &lt; rows.length; j++) {  
  367.                         TableRow tr = (TableRow) rows[j];  
  368.                         TableColumn[] td = tr.getColumns();  
  369. for (int k = 0; k &lt; td.length; k++) {  
  370.                             System.out.println("&lt;td>" + td[k].toPlainTextString());  
  371.                         }  
  372.  
  373.                     }  
  374.  
  375.                 }  
  376.             }  
  377.  
  378.         } catch (ParserException e) {  
  379.             e.printStackTrace();  
  380.         }  
  381.     }  
  382. public String getImg() {  
  383.         String img="";  
  384.         ImageTag imgnode=null;  
  385.         File file = new File("e:\\test\\jsp\\jsp\\test1.htm");  
  386.         String imgRealPath="";  
  387. if(file.exists())  
  388.         {    
  389.             file.delete();    
  390. try
  391.             {  
  392.                 file.createNewFile();  
  393.             } catch (IOException e)   
  394.             {  
  395.                 e.printStackTrace();  
  396.             }    
  397.         }else{    
  398. try
  399.               {  
  400.                 file.createNewFile();  
  401.               } catch (IOException e) {  
  402.                 e.printStackTrace();  
  403. //                   TODO Auto-generated catch block
  404.               }    
  405.         }     
  406. try {  
  407.             NodeFilter nt = new NodeClassFilter(ImageTag.class) ;  
  408.  
  409. //BufferedWriter writer = new BufferedWriter(new OutputStreamWriter (new FileOutputStream (file)));
  410.             NodeList nodeList = parser.parse(nt);  
  411.  
  412. for (int i = 0 ; i< nodeList.size(); i++){  
  413. int num=0;  
  414.                 imgnode = (ImageTag)nodeList.elementAt(i);  
  415.                 img = imgnode.getImageURL();  
  416.                 System.out.println(img);  
  417. /* String[] filePath = file.getParent().split("\\\\");
  418.                 String[] imgPath = img.split("/");
  419.                 System.out.println(img+"  "+file.getParent());
  420.                 for(int j=0;j&lt;imgPath.length;j++)
  421.                 {
  422.                     if(imgPath[j].equals(".."))
  423.                     {   
  424.                         num++;
  425.                     }
  426.                 }
  427.                 System.out.println(img.indexOf(":")+"img.indexOf(:)"+img);
  428.                 if(img.indexOf(":")!=-1)
  429.                 {
  430.                     imgRealPath=img;
  431.                 }
  432.                 else if(num>1)
  433.                 {
  434.                     System.out.println("img before replace"+img);
  435.                     img = img.replace("../","");
  436.                     System.out.println("img num&gt;1"+img+num);
  437.                     imgRealPath = filePath[filePath.length-1-num]+"/"+img;
  438.                     while((filePath.length-1-num)&gt;0)
  439.                     {
  440.                         num++;
  441.                         imgRealPath = filePath[filePath.length-1-num]+imgRealPath;
  442.                     }
  443.                     System.out.println("imgRealPath"+imgRealPath+(filePath.length-1-num));
  444.                 }
  445.                 else if(imgPath[0].equals("."))
  446.                 {
  447.                     System.out.println(file.getParent()+"imgPath[0].equals(.)");
  448.                     img = img.replace("./","");
  449.                     imgRealPath=file.getParent()+"\\"+img;
  450.                 }
  451.                 else
  452.                 {
  453.                     for(int j=0;j&lt;imgPath.length;j++)
  454.                     {
  455.                         if(imgPath[j].equals(".."))
  456.                         {
  457.                             imgPath[j] = (String)( imgPath[j].replace("..",filePath[j+1]));
  458.                             System.out.println(imgPath[j]);
  459.                         }
  460.                         if(!imgPath[j].equals(""))
  461.                             imgRealPath += "/"+imgPath[j];
  462.                     }
  463.                     imgRealPath=filePath[0]+imgRealPath;
  464.                 }
  465.                 imgRealPath = imgRealPath.replaceAll("\\\\","/");
  466.                 imgnode.setImageURL(imgRealPath);
  467.                 imgRealPath="";
  468.                 writer.write(imgnode.toHtml()); */
  469.             }    
  470. //writer.flush();
  471. // writer.close ();  
  472.         } catch (Exception e) {  
  473.             e.printStackTrace();  
  474.         }  
  475. return imgRealPath;  
  476.     }  
  477.  
  478. public static void main(String[] args) {  
  479.         HtmlAnalysis htmlAnalysis= new HtmlAnalysis(new File("f:\\test.html"));   
  480. try{  
  481.             htmlAnalysis.init();  
  482. //          System.out.println(htmlAnalysis.getMetaInfo("keywords"));
  483. //          htmlAnalysis.parser.reset();
  484. //          System.out.println(htmlAnalysis.getMetaInfo("author"));
  485. //          htmlAnalysis.parser.reset();
  486. //          System.out.println(htmlAnalysis.getMetaInfo("description"));
  487. //          htmlAnalysis.parser.reset();
  488. //          System.out.println(htmlAnalysis.getMetaInfo("other"));
  489. //          htmlAnalysis.parser.reset();
  490. //System.out.println(htmlAnalysis.getTitle());
  491. //htmlAnalysis.parser.reset();
  492. //System.out.println(htmlAnalysis.getHeadInfo());
  493.             htmlAnalysis.getTable();  
  494. //          htmlAnalysis.testTable();
  495.         }catch(Exception e){  
  496.  
  497.         }  
  498.  
  499.     }  
  500.  
  501. public static void visitTag(Tag tag) {  
  502. if (tag.getAttribute("class") != null) {  
  503.             System.out.println(" " + tag.getTagName() +  
  504.                 tag.getAttribute("class"));  
  505.         }  
  506.     }  
  507.  
  508.  
  509.  
  510. public String getCharset() {  
  511. return charset;  
  512.     }  
  513.  
  514. public void setCharset(String charset) {  
  515. this.charset = charset;  
  516.     }  
  517.  
  518. public String getContentType() {  
  519. return contentType;  
  520.     }  
  521.  
  522. public void setContentType(String contentType) {  
  523. this.contentType = contentType;  
  524.     }  
  525.  
  526. public String getMetaDataString() {  
  527. return metaDataString;  
  528.     }  
  529.  
  530. public void setMetaDataString(String metaDataString) {  
  531. this.metaDataString = metaDataString;  
  532.     }  
  533.  
  534.  
  535.  
  536. public void setTitle(String title) {  
  537. this.title = title;  
  538.     }  
  539.  
  540. public String getContent() {  
  541. return content;  
  542.     }  
  543.  
  544. public void setContent(String content) {  
  545. this.content = content;  
  546.     }  
  547. }

你可能感兴趣的:(职场,休闲,HtmlParase,解析html文件)