package com.test;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawler for the "accredited bodies" listing of the China National
 * Accreditation Service for Conformity Assessment (description translated
 * from the original header). The inner HTML of every link found inside the
 * {@code class="divtable"} elements of each result page is appended to one
 * text file.
 *
 * @Author:liangjilong
 * @Date:2014-8-27
 * @Email:[email protected]
 */
public class TestReptile {

    /** File the crawl result is written to; the crawl does not run if it already exists. */
    private static final String OUTPUT_PATH = "D:/test/test.txt";

    /** Number of result pages to fetch (pages 1..PAGE_COUNT). */
    private static final int PAGE_COUNT = 150;

    /** Timeout, in milliseconds, passed to jsoup for each page fetch. */
    private static final int TIMEOUT_MILLIS = 3000;

    /** Host (with trailing slash) that serves the listing. */
    private static final String BASE_URL = "http://219.238.178.49/";

    /**
     * Fetches every result page and writes the link contents to {@link #OUTPUT_PATH}.
     *
     * <p>If the output file already exists, a message is printed to standard
     * error and nothing is fetched. A page that cannot be downloaded is
     * reported and skipped; the remaining pages are still processed.
     *
     * @param args ignored
     * @throws Exception if the output file cannot be created or written
     */
    public static void main(String[] args) throws Exception {
        File file = new File(OUTPUT_PATH);
        // createNewFile() atomically creates the file only when it is absent, so its
        // return value replaces the racy exists()-then-create sequence.
        if (!file.createNewFile()) {
            System.err.println("文件存在..");
            return;
        }

        // NOTE(review): FileWriter encodes with the platform default charset; kept as-is
        // so existing consumers of the output file see the same encoding.
        FileWriter fileWriter = new FileWriter(file);
        try {
            for (int page = 1; page <= PAGE_COUNT; page++) {
                String pageUrl = getUrl(page);
                System.out.println(pageUrl);

                Document doc;
                try {
                    doc = Jsoup.parse(new URL(pageUrl), TIMEOUT_MILLIS);
                } catch (IOException e) {
                    // jsoup signals fetch failures by throwing, never by returning null,
                    // so the "network error" report has to live here to be reachable.
                    System.out.println("网络异常..");
                    System.err.println(pageUrl + " -> " + e);
                    continue;
                }

                writeLinks(doc, fileWriter);
                // Flush once per page so pages already fetched survive a later failure.
                fileWriter.flush();
            }
        } finally {
            // Always release the file handle, even when a write fails part-way through.
            fileWriter.close();
        }
    }

    /** Writes the inner HTML of all links inside the {@code class="divtable"} elements of {@code doc}. */
    private static void writeLinks(Document doc, FileWriter out) throws IOException {
        Elements divTables = doc.getElementsByAttributeValue("class", "divtable");
        for (Element div : divTables) {
            String linkHtml = div.select("a").html();
            if (linkHtml.length() > 0) {
                out.write(linkHtml);
                // html() separates the matched links with '\n' but adds no trailing one;
                // terminate the batch so it is not glued to the next batch's first link.
                out.write("\n");
            }
        }
    }

    /**
     * Builds the URL of one result page.
     *
     * @param pageSize the page number to request (the name is historical)
     * @return the absolute URL of that page
     */
    public static String getUrl(Integer pageSize) {
        return BASE_URL + "Acc_Search2.asp?Class=L&page=" + pageSize;
    }
}
import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.regex.Pattern; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; /** *@Author:liangjilong *@Date:2014-9-9 */ public class Test2 { private static final String regEx_html = "<[^>]+>"; // 定义HTML标签的正则表达式 /** * @param args */ public static void main(String[] args)throws Exception { Integer pageSize=20; getHtml(pageSize); } /** * @param pageSize * @throws IOException */ private static void getHtml(Integer pageSize) throws IOException { Pattern p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE); for (int i = 1; i <= pageSize; i++) { String url=getUrl(i); Document doc=Jsoup.connect(url).get(); if(doc!=null){ String fileName=doc.getElementsByAttributeValue("class", "T1").html();//抓取class=T1的内容,作为文件的名称. String path = "D:/test/"+fileName+i+".txt";//路径名/i File file = new File(path); FileWriter fileWriter=null; if (!file.exists()) { file.createNewFile();// 不存在就创建一个. String newFlie = file.getPath(); String htmlEl=doc.getElementsByAttributeValue("class", "clabel").html(); String htmlStr=p_html.matcher(htmlEl).replaceAll("").replaceAll(""", "");// 过滤html标签 fileWriter = new FileWriter(newFlie); fileWriter.write(htmlStr); fileWriter.flush(); } fileWriter.close(); }else{ System.err.println("网络异常!"); } } System.out.println("抓取完成~.."); } /** * @param pageSize页数. */ public static String getUrl(Integer pageSize){ String url="http://219.238.178.49/"; StringBuffer bufferUrl=new StringBuffer(url); bufferUrl.append("BaseInfo.asp?Id="); if(pageSize<=10){ if(pageSize==10){ bufferUrl.append("L000"+pageSize); }else{ bufferUrl.append("L0000"+pageSize); } }else if(pageSize<=100){ if(pageSize==100){ bufferUrl.append("L00"+pageSize); }else{ bufferUrl.append("L000"+pageSize); } }else if(pageSize<=1000){ if(pageSize==1000){ bufferUrl.append("L0"+pageSize); }else{ bufferUrl.append("L00"+pageSize); } } return bufferUrl.toString(); } }