现在以https://www.mann-hummel.com/mf_prodkata_china/index.html?ktlg_page=1&ktlg_lang=16&ktlg_01_fzart=1为例
抓取这个站点的汽车信息
1、定义用于保存抓取结果的汽车信息类
package com.xiang; import java.util.List; public class CarInfo { // private String manufacturer; // private String serieliaze; // private String model; // private String enginCode; // private String kilowatt; // private String horsepower; // private String makeTime; List<String> car; public List<String> getCar() { return car; } public void setCar(List<String> car) { this.car = car; } }
2、设置目录的类(包括子目录与父目录的关系)
package com.xiang; import java.util.List; public class CategoryAnther { private String id; private String name; private List<CategoryAnther> categoryAnther; public String getId() { return id; } public void setId(String id) { this.id = id; } public String getName() { return name; } public void setName(String name) { this.name = name; } public List<CategoryAnther> getCategoryAnther() { return categoryAnther; } public void setCategoryAnther(List<CategoryAnther> categoryAnther) { this.categoryAnther = categoryAnther; } }
3、主程序抓取
package com.xiang; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.cyberneko.html.parsers.DOMParser; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.xml.sax.InputSource; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.tags.TableRow; import org.htmlparser.tags.OptionTag; import org.htmlparser.tags.TableColumn; import org.htmlparser.tags.LinkTag; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; public class ExportInfo { /** * @param args **/ public static void main(String[] args) { System.out.println("main start-----------"+new Date()); // TODO Auto-generated method stub String url1 = "https://www.mann-hummel.com/mf_prodkata_china/index.html?ktlg_page=1&ktlg_lang=16&ktlg_01_fzart=1"; String url2 = "https://www.mann-hummel.com/mf_prodkata_china/index.html?ktlg_page=1&ktlg_lang=16&ktlg_01_fzart=2"; List<CategoryAnther> firstCategory = new ArrayList<CategoryAnther>(); // List<CategoryAnther> secondCategory = new ArrayList<CategoryAnther>(); firstCategory = addChildrenToList(url1); firstCategory.addAll(addChildrenToList(url2)); // secondCategory = addChildrenToList(url2); List<CarInfo> carInfo = new ArrayList<CarInfo>(); try{ File f = new File("liufen.txt"); if(!f.exists()) f.createNewFile(); FileWriter fw = new FileWriter(f,true); // readFileByLines("xiangqi.txt",fw); for(int i =0;i<firstCategory.size();i++){ CategoryAnther categoryAnther = firstCategory.get(i); List<CategoryAnther> childrenCategory = categoryAnther.getCategoryAnther(); for(int j=0;j<childrenCategory.size();j++){ String _url = 
url1+"&ktlg_01_mrksl="+categoryAnther.getId()+"&ktlg_01_mdrsl="+childrenCategory.get(j).getId(); // System.out.println(_url); //start analyze data by url carInfo.addAll(getDataByUrl(categoryAnther.getName(),childrenCategory.get(j).getName(),_url)); } } // for(int i =0;i<secondCategory.size();i++){ // CategoryAnther categoryAnther = secondCategory.get(i); // List<CategoryAnther> childrenCategory = categoryAnther.getCategoryAnther(); // for(int j=0;j<childrenCategory.size();j++){ // String _url = url2+"&ktlg_01_mrksl="+categoryAnther.getId()+"&ktlg_01_mdrsl="+childrenCategory.get(j).getId(); // //start analyze data by url // carInfo.addAll(getDataByUrl(categoryAnther.getName(),childrenCategory.get(j).getName(),_url)); // } // } fw.write("开始写入1------\r\n"); for(int k=0;k<carInfo.size();k++){ fw.write(carInfo.get(k).getCar().get(0)+"\r\n"); } fw.write("开始写入2------\r\n"); for(int k=0;k<carInfo.size();k++){ fw.write(carInfo.get(k).getCar().get(1).replace(" ", " ")+"\r\n"); } fw.write("开始写入3------\r\n"); for(int k=0;k<carInfo.size();k++){ fw.write(carInfo.get(k).getCar().get(2)+"\r\n"); } fw.write("开始写入4------\r\n"); for(int k=0;k<carInfo.size();k++){ fw.write(carInfo.get(k).getCar().get(3)+"\r\n"); } fw.write("开始写入5------\r\n"); for(int k=0;k<carInfo.size();k++){ fw.write(carInfo.get(k).getCar().get(4)+"\r\n"); } fw.write("开始写入6------\r\n"); for(int k=0;k<carInfo.size();k++){ fw.write(carInfo.get(k).getCar().get(5)+"\r\n"); } fw.write("开始写入7------\r\n"); for(int k=0;k<carInfo.size();k++){ fw.write(carInfo.get(k).getCar().get(6)+"\r\n"); } fw.flush(); fw.close(); }catch(Exception e){ e.printStackTrace(); } System.out.println("main end-----------"+new Date()); } public static String getHtmlByUrl(String url){ int layouttime = 20000; String html =""; try { URL b = new URL(url); URLConnection urlConnection = b.openConnection(); urlConnection.setReadTimeout(layouttime); InputStream inputStream = urlConnection.getInputStream(); BufferedReader in = new BufferedReader(new 
InputStreamReader(inputStream, "gb2312")); String rString = null; while ((rString = in.readLine()) != null) { html+=rString; } }catch(Exception e){ e.printStackTrace(); } return html; } public static List<CarInfo> getDataByUrl(String firstName,String secondName,String url){ System.out.println("getDataByUrl start-----------"+new Date()); List<CarInfo> carInfoList = new ArrayList<CarInfo>(); String html =""; html = getHtmlByUrl(url); Parser parser = Parser.createParser(html, "gb2312"); NodeFilter nameFilter = new HasAttributeFilter("id", "rahmen"); NodeList list = null; try { list = parser.extractAllNodesThatMatch(nameFilter); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } // System.out.println("得到的行数的大小1:"+list.toHtml()); NodeList tablelist= list.elementAt(0).getChildren(); // System.out.println("得到的行数的大小2:"+tablelist.toHtml()); NodeList trlist= tablelist.elementAt(1).getChildren(); // System.out.println("得到的行数的大小3:"+trlist.toHtml()); for(int i =6;i<trlist.size();i=i+2){ List<String> trInfo = new ArrayList<String>(); trInfo.add(firstName); trInfo.add(secondName); TableRow tableRow = (TableRow) trlist.elementAt(i); NodeList tdlist = tableRow.getChildren(); for(int j =2;j<tdlist.size();j=j+3){ TableColumn tableColumn = (TableColumn) tdlist.elementAt(j); NodeList alist = tableColumn.getChildren(); LinkTag linkTag = null; if(j==2) linkTag = (LinkTag) alist.elementAt(1); else linkTag = (LinkTag) alist.elementAt(2); trInfo.add(linkTag.getLinkText()); // System.out.print(linkTag.getLinkText()+"--"); } CarInfo carInfo = new CarInfo(); carInfo.setCar(trInfo); System.out.println(trInfo.get(0)); carInfoList.add(carInfo); } System.out.println("getDataByUrl end-----------"+new Date()); return carInfoList; } public static List<CategoryAnther> addChildrenToList(String url){ System.out.println("addChildrenToList start-----------"+new Date()); List<CategoryAnther> firstCategrory = getFirstPageCategoryIds(url,"ktlg_01_mrksl"); for (int i = 0; 
i < firstCategrory.size(); i++) { String _url = url + "&ktlg_01_mrksl=" + firstCategrory.get(i).getId(); //对二级目录进行解析 firstCategrory.get(i).setCategoryAnther(getFirstPageCategoryIds(_url,"ktlg_01_mdrsl")); } System.out.println("addChildrenToList end-----------"+new Date()); return firstCategrory; } public static List<CategoryAnther> getFirstPageCategoryIds(String url,String nameValue) { System.out.println("getFirstPageCategoryIds start-----------"+new Date()); List<CategoryAnther> categorys = new ArrayList<CategoryAnther>(); String html =""; html = getHtmlByUrl(url); Parser parser = Parser.createParser(html, "gb2312"); NodeFilter nameFilter = new HasAttributeFilter("name", nameValue); NodeList list = null; try { list = parser.extractAllNodesThatMatch(nameFilter); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } NodeList optionList= list.elementAt(0).getChildren(); for(int i =1;i<optionList.size();i++){ OptionTag option = (OptionTag) optionList.elementAt(i); CategoryAnther categoryAnther = new CategoryAnther(); // System.out.print(option.getAttribute("value")+"--"); // System.out.println(option.getChildrenHTML()); categoryAnther.setId(option.getAttribute("value")); categoryAnther.setName(option.getChildrenHTML()); categorys.add(categoryAnther); } System.out.println("getFirstPageCategoryIds end-----------"+new Date()); return categorys; } public static void readFileByLines(String fileName,FileWriter fw) { File file = new File(fileName); BufferedReader reader = null; try { System.out.println("以行为单位读取文件内容,一次读一整行:"); reader = new BufferedReader(new FileReader(file)); String tempString = null; int line = 1; // 一次读入一行,直到读入null为文件结束 while ((tempString = reader.readLine()) != null) { // 显示行号 if(tempString.trim().equals("")) fw.write(tempString+"\r\n"); else if(tempString.indexOf("-")>-1) fw.write(tempString+"\r\n"); else fw.write(tempString+"→"+"\r\n"); } reader.close(); } catch (IOException e) { e.printStackTrace(); } finally { if 
(reader != null) { try { reader.close(); } catch (IOException e1) { } } } } }
如有不明白的地方,请致电 13886053422 或联系 QQ 526151410。
下面附有项目文件。由于版权原因,文件设有密码,如需使用请向本人索要密码。