WebMagic

WebMagic

WebMagic是一个简单灵活的Java爬虫框架。基于WebMagic,你可以快速开发出一个高效、易维护的爬虫。
注意:不要使用爬虫抓取敏感数据,否则可能涉及法律问题。

特性:

  • 简单的API,可快速上手
  • 模块化的结构,可轻松扩展
  • 提供多线程和分布式支持

插件:XPath Helper
Maven依赖:

<dependency>
    <groupId>us.codecraft</groupId>  
    <artifactId>webmagic-core</artifactId> 
    <version>0.7.3</version>
</dependency>
<dependency>  
    <groupId>us.codecraft</groupId>  
    <artifactId>webmagic-extension</artifactId> 
    <version>0.7.3</version>
</dependency>

代码示例

/**
 * WebMagic {@code PageProcessor} that logs the cover image, title, author and
 * chapter list extracted from a downloaded book page.
 *
 * <p>Nothing is persisted here; every extracted value is only written to the log.
 */
@Component
public class BookPage implements PageProcessor {
    /**
     * Number of leading characters removed from the author paragraph.
     * NOTE(review): presumably the length of a fixed label in front of the author
     * name on the target site -- confirm against the live page markup.
     */
    private static final int AUTHOR_PREFIX_LENGTH = 7;

    private static final Logger logger = LoggerFactory.getLogger(BookPage.class);

    /** Crawl settings: 3 retries, 1000 sleep time and 10000 timeout, as passed to {@code Site}. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

    /**
     * Extracts the book metadata and the chapter links from {@code page} and logs them.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        Selectable html = page.getHtml();

        logger.warn("图片{}", html.css("div#sidebar div#fmimg img", "src").get());
        logger.warn("标题:{}", html.xpath("//*[@id=info]/h1/text()").get());
        // The author node may be absent (e.g. the page is not a book page), so the
        // label is stripped defensively instead of calling substring on a null value.
        logger.warn("作者:{}", stripAuthorPrefix(html.xpath("//*[@id=info]/p[1]/text()").get()));

        // One <dd> element per chapter.
        List<Selectable> chapterNodes = html.xpath("//*[@id=list]/dl/dd").nodes();
        logger.debug("chapter nodes: {}", chapterNodes);
        for (Selectable chapterNode : chapterNodes) {
            logger.warn("章节:{}", chapterNode.xpath("/dd/a/text()").get());
            logger.warn("地址: https://www.um16.cn{}", chapterNode.xpath("/dd/a/@href").get());
        }
    }

    /**
     * Returns the crawl configuration used for this processor.
     *
     * @return the {@code Site} settings built when this processor was created
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Drops the leading label from the raw author text.
     *
     * @param rawAuthor text of the author paragraph, possibly {@code null}
     * @return the text after the first {@link #AUTHOR_PREFIX_LENGTH} characters, or
     *         {@code rawAuthor} unchanged when it is {@code null} or shorter than the label
     */
    private static String stripAuthorPrefix(String rawAuthor) {
        if (rawAuthor == null || rawAuthor.length() < AUTHOR_PREFIX_LENGTH) {
            return rawAuthor;
        }
        return rawAuthor.substring(AUTHOR_PREFIX_LENGTH);
    }
}

测试

  // The BookPage processor used by the tests below, injected via @Resource.
  @Resource
    private BookPage page;
    @Test
    void contextLoads() {
        // Crawl a single book page using one thread.
        final String bookUrl = "https://www.um16.cn/info/37609.html";
        Spider singlePageSpider = Spider.create(page);
        singlePageSpider.addUrl(bookUrl);
        singlePageSpider.thread(1);
        singlePageSpider.run();
    }
    // Crawl book pages 1 through 20 using five threads.
    @Test
    public void t2(){
        final int pageCount = 20;
        String[] urls = new String[pageCount];
        // Array slot idx holds the URL of book number idx + 1.
        for (int idx = 0; idx < pageCount; idx++) {
            urls[idx] = "https://www.um16.cn/info/" + (idx + 1) + ".html";
        }
        Spider.create(page).addUrl(urls).thread(5).run();
    }

![image.png](https://img-blog.csdnimg.cn/img_convert/bc40bd914d43d214fda8d7a874cec61e.png#pic_center)

将数据添加到数据库和es中
image.png

/**
 * WebMagic {@code PageProcessor} that extracts a book and its chapter list from a
 * downloaded page and stores them through the injected DAOs.
 *
 * <p>The book is inserted through {@code PcDao}, each chapter through {@code PcAllDao},
 * and a document for the book and for every chapter is saved through
 * {@code PcDocumentDao}.
 */
@Component
public class BookPage implements PageProcessor {
    /**
     * Number of leading characters removed from the author paragraph.
     * NOTE(review): presumably the length of a fixed label in front of the author
     * name on the target site -- confirm against the live page markup.
     */
    private static final int AUTHOR_PREFIX_LENGTH = 7;

    private static final Logger logger = LoggerFactory.getLogger(BookPage.class);

    /** Crawl settings: 3 retries, 1000 sleep time and 10000 timeout, as passed to {@code Site}. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

    // Database access for books.
    @Resource
    private PcDao dao;
    // Database access for chapters.
    @Resource
    private PcAllDao allDao;
    // Writes documents to Elasticsearch.
    @Resource
    private PcDocumentDao documentDao;

    /**
     * Extracts the book metadata and chapters from {@code page} and persists them.
     *
     * <p>Pages without a title element are skipped. Chapters and documents are only
     * written when the book insert reports at least one affected row.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        Selectable html = page.getHtml();

        // Each selector is evaluated once and the value reused for both stores.
        String title = html.xpath("//*[@id=info]/h1/text()").get();
        if (title == null) {
            // Not a book page (or the layout changed): storing an empty record is useless.
            logger.warn("No book title found on {}, skipping page", page.getUrl());
            return;
        }
        String img = html.css("div#sidebar div#fmimg img", "src").get();
        String author = stripAuthorPrefix(html.xpath("//*[@id=info]/p[1]/text()").get());

        Pc pc = new Pc();
        pc.setImg(img);
        pc.setTitle(title);
        pc.setAuto(author);
        if (dao.insert(pc) <= 0) {
            return;
        }

        documentDao.save(newBookDocument(img, title, author));
        logger.warn("书籍添加成功");

        // One <dd> element per chapter.
        for (Selectable chapterNode : html.xpath("//*[@id=list]/dl/dd").nodes()) {
            String chapterTitle = chapterNode.xpath("/dd/a/text()").get();
            String chapterUrl = "https://www.um16.cn/" + chapterNode.xpath("/dd/a/@href").get();

            PcAll chapter = new PcAll();
            // NOTE(review): relies on dao.insert having populated pc's id -- confirm
            // the mapper is configured to write the generated key back.
            chapter.setPid(pc.getId());
            chapter.setTitle(chapterTitle);
            chapter.setImgs(chapterUrl);
            allDao.insert(chapter);

            // A fresh document per chapter instead of re-saving one mutated instance;
            // it carries the book fields plus the chapter name and address.
            PcDocument chapterDocument = newBookDocument(img, title, author);
            chapterDocument.setId(UUIDUtils.getUUIDInOrderId());
            chapterDocument.setName(chapterTitle);
            chapterDocument.setImgs(chapterUrl);
            documentDao.save(chapterDocument);
            logger.warn("书籍详细添加成功");
        }
    }

    /**
     * Returns the crawl configuration used for this processor.
     *
     * @return the {@code Site} settings built when this processor was created
     */
    @Override
    public Site getSite() {
        return site;
    }

    /** Builds a document pre-filled with the book-level fields. */
    private static PcDocument newBookDocument(String img, String title, String author) {
        PcDocument document = new PcDocument();
        document.setImg(img);
        document.setTitle(title);
        document.setAuto(author);
        return document;
    }

    /**
     * Drops the leading label from the raw author text.
     *
     * @param rawAuthor text of the author paragraph, possibly {@code null}
     * @return the text after the first {@link #AUTHOR_PREFIX_LENGTH} characters, or
     *         {@code rawAuthor} unchanged when it is {@code null} or shorter than the label
     */
    private static String stripAuthorPrefix(String rawAuthor) {
        if (rawAuthor == null || rawAuthor.length() < AUTHOR_PREFIX_LENGTH) {
            return rawAuthor;
        }
        return rawAuthor.substring(AUTHOR_PREFIX_LENGTH);
    }
}

查看数据
image.png

image.png

你可能感兴趣的:(SpringCloud,爬虫,java,python)