java爬取jd的所有图书类信息

maven 依赖

		
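        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.47</version>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.8.1</version>
        </dependency>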
        

java代码 (先根据 BASE_FILE_PATH 建立 jd_book 文件夹)

然后运行下面的代码即可。SQL 文件会生成在 jd_book 文件夹下,导入 MySQL 即可。价格查询接口大约访问几千次后,就会被京东禁止访问一段时间;
IP 代理需要付费,所以没有使用。对于价格问题,一种解决方案是:在用 Java 读取数据时再发起请求查询价格,此时查询量小,不会被限制;
当然,也可以在查询价格时让线程休眠一段时间,以降低请求频率。


import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.Collectors;

/**
 * Crawls JD.com book category listings and writes one SQL {@code insert}
 * statement per book into per-worker output files.
 *
 * <p>The category list is split into at most {@link #MAX_THREADS} slices; each
 * slice is processed by its own thread, which walks every listing page of every
 * category, opens each book's detail page, and appends an insert statement for
 * table {@code jd_book_info} to its own {@code sqlresult_N.txt}. URLs that fail
 * are appended to a shared {@code errorUrl.txt}.
 *
 * <p>The CSS selectors used here reflect the page layout at the time the code
 * was written; they are not validated against the live site.
 *
 * @Author: panlf
 * @Date: 2019/9/27 11:43
 */
public class JDBook {
    /** Base URL that relative listing paths are resolved against. */
    private static final String BASE_LIST_URL = "https://list.jd.com/";
    /** Per-worker SQL output file; {@code %s} is replaced by the worker number. */
    private static final String BASE_FILE_PATH = "D:\\ideaWorkplace\\demo1\\src\\main\\resources\\jd_book\\sqlresult_%s.txt";
    /** File that collects the URLs which could not be processed. */
    private static final String BASE_ERR_PATH = "D:\\ideaWorkplace\\demo1\\src\\main\\resources\\jd_book\\errorUrl.txt";
    /**
     * Back-tick separated list of category listing paths.
     *
     * <p>NOTE(review): the first entry (cat=1713,3261,3359) looks like a
     * sub-category of cat=1713,3261, which is also listed, so its books may be
     * crawled twice - confirm whether it is a leftover from a test run.
     */
    private static final String URL_TYPE =
            "list.html?cat=1713,3261,3359`/list.html?cat=1713,3258`/list.html?cat=1713,3259`/list.html?cat=1713,3260`/list.html?cat=1713,3261`" +
            "/list.html?cat=1713,12775`/list.html?cat=1713,12776`/list.html?cat=1713,13627`/list.html?cat=1713,13634`/list.html?cat=1713,3262`" +
                    "/list.html?cat=1713,3263`/list.html?cat=1713,3267`/list.html?cat=1713,3266`/list.html?cat=1713,3264`/list.html?cat=1713,3265`" +
                    "/list.html?cat=1713,13613`/list.html?cat=1713,3270`/list.html?cat=1713,3271`/list.html?cat=1713,9278`/list.html?cat=1713,9291`" +
                    "/list.html?cat=1713,9301`/list.html?cat=1713,9309`/list.html?cat=1713,9314`/list.html?cat=1713,3269`/list.html?cat=1713,3272`" +
                    "/list.html?cat=1713,3273`/list.html?cat=1713,3279`/list.html?cat=1713,3276`/list.html?cat=1713,3275`/list.html?cat=1713,3274`" +
                    "/list.html?cat=1713,3277`/list.html?cat=1713,3280`/list.html?cat=1713,3281`/list.html?cat=1713,3284`/list.html?cat=1713,3287`" +
                    "/list.html?cat=1713,3285`/list.html?cat=1713,9340`/list.html?cat=1713,9368`/list.html?cat=1713,3286`/list.html?cat=1713,9351`" +
                    "/list.html?cat=1713,3288`/list.html?cat=1713,3289`/list.html?cat=1713,3282`/list.html?cat=1713,11047`/list.html?cat=1713,3290`" +
                    "/list.html?cat=1713,3291`/list.html?cat=1713,3294`/list.html?cat=1713,4758`/list.html?cat=1713,4855`/list.html?cat=1713,6929`" +
                    "/list.html?cat=1713,14669`/list.html?cat=1713,3296`/list.html?cat=1713,11745";
    /** Price endpoint; the product code is appended directly after {@code J_}. */
    private static final String PRICE_JD = "https://p.3.cn/prices/mgets?skuIds=J_";
    /** Upper bound on the number of worker threads started by {@link #main}. */
    private static final int MAX_THREADS = 10;
    /** HTTP request headers sent with every price lookup. */
    private static final Map<String, String> HEADER = new HashMap<>();
    /**
     * Maps a label fragment found in the detail page's parameter list to the
     * database column its value is stored in. Iteration order is the match
     * priority: the first fragment contained in an item's text wins.
     */
    private static final Map<String, String> FIELD_COLUMNS = new LinkedHashMap<>();

    static {
        // A Host header carries only the host name, never a scheme.
        HEADER.put("Host", "p.3.cn");
        HEADER.put("User-Agent", "  Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0");
        HEADER.put("Accept", "  text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        HEADER.put("Accept-Language", "zh-cn,zh;q=0.5");
        HEADER.put("Accept-Charset", "  GB2312,utf-8;q=0.7,*;q=0.7");
        HEADER.put("Connection", "keep-alive");

        FIELD_COLUMNS.put("出版社:", "publisher");
        FIELD_COLUMNS.put("ISBN", "isbn");
        FIELD_COLUMNS.put("出版时间", "book_time");
        FIELD_COLUMNS.put("页数", "page_num");
        FIELD_COLUMNS.put("商品编码", "jd_code");
    }

    /**
     * Splits the category list into at most {@link #MAX_THREADS} slices, starts
     * one worker thread per slice, waits for all of them, and then closes the
     * shared error file.
     *
     * @param args unused
     * @throws Exception if an output file cannot be opened or the wait is interrupted
     */
    public static void main(String[] args) throws Exception {
        List<String> categories = Arrays.asList(URL_TYPE.split("`"));
        // Ceiling division so that the number of slices never exceeds MAX_THREADS.
        int sliceSize = Math.max(1, (categories.size() + MAX_THREADS - 1) / MAX_THREADS);
        List<Thread> workers = new ArrayList<>();
        BufferedWriter errOut = getBufferedOut(BASE_ERR_PATH);
        try {
            for (int start = 0; start < categories.size(); start += sliceSize) {
                int workerNo = start / sliceSize;
                int end = Math.min(start + sliceSize, categories.size());
                List<String> slice = new ArrayList<>(categories.subList(start, end));
                BufferedWriter out = getBufferedOut(String.format(BASE_FILE_PATH, workerNo));
                workers.add(startThread(out, errOut, slice, String.format("第%d条线程", workerNo)));
            }
        } finally {
            // The error writer is shared, so it may only be closed once every
            // worker that was started has finished.
            try {
                for (Thread worker : workers) {
                    worker.join();
                }
            } finally {
                errOut.close();
            }
        }
    }

    /**
     * Crawls every category in {@code list} and writes one insert statement per
     * book to {@code out}. A failure on a category, a listing page, or a single
     * book is recorded in {@code outErr} and the crawl continues with the next
     * item. {@code out} is closed when this method returns; {@code outErr} is
     * left open for the caller.
     *
     * @param out    destination for the generated SQL; closed by this method
     * @param outErr shared destination for failed URLs
     * @param list   category listing paths, relative to {@link #BASE_LIST_URL}
     * @throws Exception if closing {@code out} fails
     */
    private static void searchJD(BufferedWriter out, BufferedWriter outErr, List<String> list) throws Exception {
        int index = 0;
        try (BufferedWriter sqlOut = out) {
            for (String one : list) {
                String bookUrl = toListUrl(one);
                try {
                    Document indexPage = Jsoup.parse(new URL(bookUrl), 5000);
                    // The pager link is used as a template; its page number is
                    // substituted for each page below.
                    String pageTemplate = toListUrl(indexPage.select(".p-num").get(0).child(1).attr("href"));
                    int maxPage = Integer.parseInt(indexPage.select(".p-skip").get(0).child(0).select("b").html());
                    for (int i = 1; i <= maxPage; i++) {
                        // Plain substitution instead of String.format: a '%' in
                        // the scraped href would otherwise be read as a format
                        // specifier.
                        String pageUrl = pageTemplate.replaceAll("&page=[0-9]*&", "&page=" + i + "&");
                        try {
                            index = writeListPage(pageUrl, sqlOut, outErr, index);
                        } catch (Exception e) {
                            logError(outErr, pageUrl, e);
                        }
                    }
                } catch (Exception e) {
                    logError(outErr, bookUrl, e);
                }
            }
        }
    }

    /**
     * Processes one listing page: collects the detail-page links and writes an
     * insert statement for each book.
     *
     * @param pageUrl absolute URL of the listing page
     * @param out     destination for the generated SQL
     * @param outErr  destination for failed book URLs
     * @param index   number of books written so far by this worker
     * @return the updated count of books written
     * @throws Exception if the listing page cannot be fetched or lacks the expected structure
     */
    private static int writeListPage(String pageUrl, BufferedWriter out, BufferedWriter outErr, int index) throws Exception {
        Document pageDetail = Jsoup.parse(new URL(pageUrl), 5000);
        // The hrefs are prefixed with "https:", i.e. they are assumed to be
        // protocol-relative ("//host/...").
        List<String> bookDetailList = pageDetail.select("#plist").get(0).select(".gl-item").stream()
                .map(item -> "https:" + item.select(".p-name").get(0).child(0).attr("href"))
                .collect(Collectors.toList());
        for (String url : bookDetailList) {
            try {
                out.write(buildInsertSql(url) + "\r\n");
                System.out.println(Thread.currentThread().getName() + " 第" + (++index) + "条");
                out.flush();
            } catch (Exception e) {
                logError(outErr, url, e);
            }
        }
        return index;
    }

    /**
     * Fetches one book detail page and builds its insert statement.
     *
     * @param url absolute URL of the book detail page
     * @return a complete {@code insert into jd_book_info ...;} statement
     * @throws IOException if the page cannot be fetched
     */
    private static String buildInsertSql(String url) throws IOException {
        // Column -> value in insertion order; a map also guarantees that no
        // column name is emitted twice.
        Map<String, String> row = new LinkedHashMap<>();
        row.put("id", UUID.randomUUID().toString().replace("-", ""));

        Document detail = Jsoup.parse(new URL(url), 50000);

        // text() rather than html(): the stored value should be the decoded
        // text, not entity-escaped markup.
        Element crumb = detail.select(".crumb.fl.clearfix").get(0);
        row.put("book_name", crumb.select(".item.ellipsis").get(0).text());
        row.put("book_type", crumb.select("a").stream().map(Element::text).collect(Collectors.joining(">")));

        Element authorBox = detail.getElementById("p-author");
        boolean hasAuthor = authorBox != null && !authorBox.children().isEmpty();
        row.put("author", hasAuthor ? authorBox.child(0).text() : "无作者信息");

        Element params = detail.select("#parameter2").get(0);
        for (Element item : params.children()) {
            String label = item.text();
            for (Map.Entry<String, String> field : FIELD_COLUMNS.entrySet()) {
                if (label.contains(field.getKey())) {
                    // The value is taken from the item's title attribute.
                    row.putIfAbsent(field.getValue(), item.attr("title"));
                    break;
                }
            }
        }

        // The list price is looked up by product code; the column is simply
        // omitted when no price could be obtained.
        String price = getPrice(row.getOrDefault("jd_code", ""));
        if (StringUtils.isNotBlank(price)) {
            row.put("price", price);
        }
        row.put("detail_url", url);

        String columns = String.join(",", row.keySet());
        String values = row.values().stream().map(JDBook::escapeSql).collect(Collectors.joining("','"));
        return String.format("insert into jd_book_info (%s) values ('%s');", columns, values);
    }

    /**
     * Escapes a value for use inside a single-quoted SQL string literal.
     * Assumes MySQL with its default SQL mode, where backslash is an escape
     * character - TODO confirm against the target server.
     *
     * @param value raw value, may be {@code null}
     * @return the escaped value, or an empty string for {@code null}
     */
    private static String escapeSql(String value) {
        if (value == null) {
            return "";
        }
        return value.replace("\\", "\\\\").replace("'", "''");
    }

    /**
     * Resolves a listing path or scraped href against {@link #BASE_LIST_URL}
     * without producing a double slash. Absolute and protocol-relative inputs
     * are returned as absolute URLs.
     *
     * @param path relative path, protocol-relative URL, or absolute URL
     * @return an absolute URL
     */
    private static String toListUrl(String path) {
        if (path.startsWith("http://") || path.startsWith("https://")) {
            return path;
        }
        if (path.startsWith("//")) {
            return "https:" + path;
        }
        return BASE_LIST_URL + (path.startsWith("/") ? path.substring(1) : path);
    }

    /**
     * Records a failed URL in the error file and prints the stack trace. A
     * failure to write the error file itself is printed, not propagated, so
     * the crawl can continue.
     *
     * @param outErr destination for failed URLs
     * @param url    the URL that could not be processed
     * @param cause  the failure
     */
    private static void logError(BufferedWriter outErr, String url, Exception cause) {
        try {
            outErr.write("错误url: " + url + "\r\n");
            outErr.flush();
        } catch (IOException io) {
            io.printStackTrace();
        }
        cause.printStackTrace();
    }

    /**
     * Queries the price endpoint for one product and returns the {@code "m"}
     * field of the first element of the JSON array response.
     *
     * @param bookOrder product code; when blank no request is made
     * @return the price as text, or {@code null} if it could not be obtained
     */
    private static String getPrice(String bookOrder) {
        if (StringUtils.isBlank(bookOrder)) {
            return null;
        }
        String priceUrl = PRICE_JD + bookOrder;
        try {
            String body = Jsoup.connect(priceUrl)
                    .ignoreContentType(true)
                    .headers(HEADER)
                    .timeout(50000)
                    .execute()
                    .body();
            List<JSONObject> prices = JSONArray.parseArray(body, JSONObject.class);
            if (prices == null || prices.isEmpty()) {
                return null;
            }
            Object listPrice = prices.get(0).get("m");
            return listPrice == null ? null : listPrice.toString();
        } catch (Exception e) {
            // Best effort: a missing price must not fail the whole book.
            System.out.println(e.getMessage()+"url: "+ priceUrl);
            return null;
        }
    }

    /**
     * Starts a named worker thread that runs {@link #searchJD} on one slice of
     * the category list.
     *
     * @param out        SQL output for this worker
     * @param outErr     shared error output
     * @param typeUrl    category listing paths assigned to this worker
     * @param threadName name given to the thread
     * @return the started thread, so the caller can wait for it
     */
    private static Thread startThread(BufferedWriter out, BufferedWriter outErr, List<String> typeUrl, String threadName) {
        Thread worker = new Thread(() -> {
            try {
                searchJD(out, outErr, typeUrl);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }, threadName);
        worker.start();
        return worker;
    }

    /**
     * Opens {@code path} for appending as UTF-8 text, creating the file and any
     * missing parent directories first.
     *
     * @param path file to append to
     * @return a buffered writer positioned at the end of the file
     * @throws IOException if the directory or file cannot be created or opened
     */
    private static BufferedWriter getBufferedOut(String path) throws IOException {
        File target = new File(path);
        File parent = target.getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
            throw new IOException("cannot create directory: " + parent);
        }
        // Explicit charset: the output contains Chinese text and must not
        // depend on the platform default encoding.
        return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(target, true), StandardCharsets.UTF_8));
    }
}

你可能感兴趣的:(java,爬虫,java,爬虫,图书,京东)