Java爬虫|爬虫爬jj榜单数据写入excel

大学时选修 Python 课,课设就是让我们用爬虫爬取数据、写入文件,然后再做数据分析(词云图、地图分类等)。Python 已经记不清了,现在用 Java 尝试一下爬取数据。

爬虫分为三个步骤:1. 获取你自己电脑访问网站时的请求头;2. 确定目标网站的 URL;3. 对爬回来的网页返回值进行切分,提取出有用的部分。

 

package com.example.concurrent;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.FileOutputStream;
import java.io.IOException;

import java.util.ArrayList;
import java.util.List;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

public class BookLibrary {
    // 目标榜单URL 序号 作者 作品 类型 进度 字数 作品积分
    private static final String TARGET_URL = "https://www.jjwxc.net/topten.php?orderstr=7&t=1";

    public static void main(String[] args) {
        List novels = crawlRankList();
//        novels.forEach(System.out::println);
        writeNovelsToExcel(novels,"/Documents/novels.xlsx");
    }



    public static void writeNovelsToExcel(List novelList, String outputPath) {
        try (Workbook workbook = new XSSFWorkbook()) { // 创建.xlsx格式工作簿
            Sheet sheet = workbook.createSheet("小说列表"); // 创建工作表

            // 创建表头行
            String[] headers = {"排名", "作者", "书名", "类型", "进度", "总字数", "投票数", "更新时间", "简介"};
            Row headerRow = sheet.createRow(0);
            for (int i = 0; i < headers.length; i++) {
                Cell cell = headerRow.createCell(i);
                cell.setCellValue(headers[i]);
            }
            // 填充数据行
            int rowNum = 1;
            for (Novel novel : novelList) {
                Row row = sheet.createRow(rowNum++);
                // 按字段顺序写入(需与Novel类构造参数顺序一致)
                row.createCell(0).setCellValue(novel.getRank());
                row.createCell(1).setCellValue(novel.getAuthor());
                row.createCell(2).setCellValue(novel.getTitle());
                row.createCell(3).setCellValue(novel.getType());
                row.createCell(4).setCellValue(novel.getProgress());
                row.createCell(5).setCellValue(novel.getTotalNUm());
                row.createCell(6).setCellValue(novel.getVotes());
                row.createCell(7).setCellValue(novel.getTime());
                row.createCell(8).setCellValue(novel.getJianjie());
            }

            // 自动调整列宽(可选)
            for (int i = 0; i < headers.length; i++) {
                sheet.autoSizeColumn(i);
            }

            // 写入文件
            try (FileOutputStream outputStream = new FileOutputStream(outputPath)) {
                workbook.write(outputStream);
                System.out.println("Excel文件已生成:" + outputPath);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * 爬取榜单数据
     */
    public static List crawlRankList() {
        List novelList = new ArrayList<>();
        try {
            // 1. 模拟浏览器请求(关键反爬策略)- useragent放入你自己的请求头
            Document doc = Jsoup.connect(TARGET_URL)
                    .userAgent("Mozilla/5.0 Version/17.4 Safari")
                    .header("Accept-Language", "zh-CN,zh;q=0.9")
                    .timeout(10_000)
                    .get();
            String htmlContent = doc.html(); // 获取完整HTML内容(含格式)
//            System.out.println(htmlContent);
            // 2. 定位榜单表格(需根据实际HTML结构调整选择器)
//            Element table = doc.selectFirst("table.rank-table");
            Element targetTable = null;
            Elements tables = doc.select("table[width=984][border=0][align=center][cellpadding=0][cellspacing=1][bgcolor=#009900]");
//            System.out.println("tables.size>>>" + tables.size());
            for (Element table : tables) {
                Element firstTd = table.selectFirst("td:eq(0)"); // 第一个td
                if (firstTd != null && firstTd.text().trim().equals("序号")) {
//                    System.out.println("找到目标表格:\n" + table);
                    targetTable = table;
                    break;
                }
            }
            if (targetTable == null) {
                return new ArrayList<>();
            }
            Elements rows = targetTable.select("tr:has(td)"); // 跳过表头

            // 3. 解析每一行数据 序号 作者 作品 类型 进度 字数 作品积分 截止时间
            for (int i = 1; i < rows.size(); i++) {
                Elements cols = rows.get(i).select("td");
                if (cols.size() < 8) continue;
//                System.out.println(cols.html() + ">>>>>>");
                String rank = cols.get(0).text();
                Element authorTd = cols.get(1);
                String author = "";

                Element links = authorTd.selectFirst("a");
                if (links != null) {
                    author = links.text().trim();
                } else {
                    System.out.println("未找到标签");
                }
                Element authorTdss = cols.get(2);
                String bookName = "";
                String rawRel = "";

                Element link = authorTdss.selectFirst("a");
                if (link != null) {
                    // 提取书名
                    bookName = link.text().trim();
                    // 提取并处理rel属性
                    rawRel = link.attr("rel").replaceAll("
", "\n"); // 输出结果 // System.out.println("书名: " + bookName); // System.out.println("简介:\n" + rawRel); } else { System.out.println("未找到
标签"); } String type = cols.get(3).text(); String progress = cols.get(4).text(); String totalNUm = cols.get(5).text(); String votes = cols.get(6).text(); String time = cols.get(7).text(); novelList.add(new Novel(rank, author, bookName, type, progress, totalNUm, votes, time, rawRel)); } // 4. 添加延迟防止封IP Thread.sleep(8000); } catch (IOException | InterruptedException e) { e.printStackTrace(); } return novelList; } /** * 小说数据实体类 序号 作者 作品 类型 进度 字数 作品积分 */ static class Novel { private String rank;//序号 private String author;//作者 private String title;//作品 private String type;//类型 private String progress;//进度 private String totalNUm;//字数 private String votes;//积分 private String time;//时间 private String jianjie;//简介 public Novel(String rank, String author, String title, String type, String progress, String totalNUm, String votes, String time, String jianjie) { this.rank = rank; this.author = author; this.title = title; this.type = type; this.progress = progress; this.totalNUm = totalNUm; this.votes = votes; this.time = time; this.jianjie = jianjie; } @Override public String toString() { return "Novel{" + "rank='" + rank + '\'' + ", author='" + author + '\'' + ", title='" + title + '\'' + ", type='" + type + '\'' + ", progress='" + progress + '\'' + ", totalNUm='" + totalNUm + '\'' + ", votes='" + votes + '\'' + ", time='" + time + '\'' + ", jianjie='" + jianjie + '\'' + '}'; } public String getRank() { return rank; } public void setRank(String rank) { this.rank = rank; } public String getAuthor() { return author; } public void setAuthor(String author) { this.author = author; } public String getType() { return type; } public void setType(String type) { this.type = type; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getProgress() { return progress; } public void setProgress(String progress) { this.progress = progress; } public String getTotalNUm() { return totalNUm; } public void setTotalNUm(String 
totalNUm) { this.totalNUm = totalNUm; } public String getTime() { return time; } public void setTime(String time) { this.time = time; } public String getVotes() { return votes; } public void setVotes(String votes) { this.votes = votes; } public String getJianjie() { return jianjie; } public void setJianjie(String jianjie) { this.jianjie = jianjie; } } }

你可能感兴趣的:(爬虫,java)