Positioning: a lightweight library focused on HTML parsing (in other words, fast, but it cannot render JavaScript, so dynamically loaded pages cannot be captured)
Core capabilities:
DOM tree parsing with CSS selector queries
HTML sanitization and formatting
Element traversal and attribute extraction
Typical scenarios: data extraction from static pages, content cleaning (a minimal usage sketch follows below)
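A minimal sketch of the capabilities listed above, assuming a recent jsoup version (Safelist API) and a reachable static page; the URL and selectors are placeholders rather than values from the project:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Safelist;

public class JsoupQuickStart {
    public static void main(String[] args) throws Exception {
        // Fetch and parse a static page (DOM tree parsing)
        Document doc = Jsoup.connect("https://example.com").timeout(3000).get();
        // CSS selector query plus attribute extraction; abs:href resolves relative links
        for (Element link : doc.select("body a[href]")) {
            System.out.println(link.text() + " -> " + link.attr("abs:href"));
        }
        // HTML sanitization: keep only a basic whitelist of tags
        String cleaned = Jsoup.clean(doc.body().html(), Safelist.basic());
        System.out.println(cleaned.length());
    }
}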
public static Document getJsoupDoc(String url, Integer frequency, Integer connectTimeout) {
    Document document = null;
    try {
        if (connectTimeout == null) {
            document = Jsoup.connect(url).ignoreContentType(true).get();
        } else {
            document = Jsoup.connect(url).ignoreContentType(true).maxBodySize(0).timeout(connectTimeout).get();
        }
    } catch (Exception e) {
        // Swallow the error here; the retry logic below decides whether to try again
        log.warn("getJsoupDoc failed for {}: {}", url, e.getMessage());
        document = null;
    }
    // Retry up to 3 attempts in total, with a short back-off between attempts
    if (document == null && frequency < 3) {
        frequency = frequency + 1;
        try {
            Thread.sleep(100);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("Sleep interrupted: " + e.getMessage(), e);
        }
        document = getJsoupDoc(url, frequency, connectTimeout);
    }
    return initUrl(url, document);
}
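For reference, a hedged example of how the fetcher above might be called; the retry counter starts at 1, the timeout is in milliseconds, and the URL and selector are illustrative assumptions:
// Up to 3 attempts in total, 3-second connect/read timeout
Document doc = getJsoupDoc("https://example.com/news", 1, 3000);
if (doc != null) {
    // Links have already been rewritten to absolute URLs by initUrl
    for (Element item : doc.select("ul.news-list a")) {
        System.out.println(item.text() + " -> " + item.attr("href"));
    }
}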
Positioning: a full-featured browser simulator with JavaScript support (loads JS-driven dynamic data)
Core capabilities:
Executing complex AJAX requests
Simulating user interaction (clicks / form submission)
Cookie management and page navigation
Typical scenarios: dynamic web page crawling, automated testing (a small interaction sketch follows below)
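A hedged interaction sketch for the capabilities above; the URL, form index and control names are assumptions for illustration, and the options mirror those used by the utility class later in this post:
import org.htmlunit.WebClient;
import org.htmlunit.html.HtmlForm;
import org.htmlunit.html.HtmlPage;

public class HtmlUnitInteractionSketch {
    public static void main(String[] args) throws Exception {
        try (WebClient browser = new WebClient()) {
            browser.getOptions().setCssEnabled(false);
            browser.getOptions().setJavaScriptEnabled(true);
            browser.getOptions().setThrowExceptionOnScriptError(false);
            // Load the page and let background AJAX requests finish
            HtmlPage page = browser.getPage("https://example.com/search");
            browser.waitForBackgroundJavaScript(2000);
            // Simulate user interaction: type into a form field and submit (hypothetical names)
            HtmlForm form = page.getForms().get(0);
            form.getInputByName("q").type("jsoup");
            HtmlPage result = form.getButtonByName("submit").click();
            // Cookies picked up during navigation are kept by the WebClient automatically
            browser.getCookieManager().getCookies().forEach(c -> System.out.println(c.getName()));
            System.out.println(result.asXml().length());
        }
    }
}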
/**
 * Dynamic crawl: render the page with HtmlUnit, then hand the HTML to Jsoup.
 *
 * @param url               target URL
 * @param waitTime          time in ms to wait for background JavaScript
 * @param javaScriptEnabled whether to execute JavaScript on the page
 * @return parsed Document with links rewritten to absolute URLs, or null on failure
 */
public static Document getDynamicCrawlersDocument(String url, Integer waitTime, boolean javaScriptEnabled) {
    Document document = null;
    try (WebClient browser = new WebClient()) {
        // Disable CSS and tolerate script errors so dynamic pages can still be captured
        browser.getOptions().setCssEnabled(false);
        browser.getOptions().setJavaScriptEnabled(javaScriptEnabled);
        browser.getOptions().setThrowExceptionOnScriptError(false);
        browser.getOptions().setUseInsecureSSL(true);
        // Custom listener that silently ignores JavaScript errors
        browser.setJavaScriptErrorListener(new MyJSErrorListener());
        HtmlPage page = browser.getPage(url);
        // Give background scripts time to finish
        browser.waitForBackgroundJavaScript(waitTime);
        String pageAsXml = page.asXml();
        // Strip the XML declaration before handing the markup to Jsoup
        document = Jsoup.parse(pageAsXml.replaceAll("\\<\\?xml.*?\\?>", ""));
        document.setBaseUri(url);
    } catch (ScriptException e) {
        log.error("getDynamicCrawlersDocument page: {} JavaScript error: {}", url, e.getMessage());
    } catch (UnknownHostException e) {
        log.error("getDynamicCrawlersDocument page: {} unknown host: {}", url, e.getMessage());
    } catch (FailingHttpStatusCodeException e) {
        log.error("getDynamicCrawlersDocument page: {} HTTP status error: {}", url, e.getStatusCode());
    } catch (Exception e) {
        log.error("getDynamicCrawlersDocument page: {} failed to load page: {}", url, e.getMessage());
    }
    return initUrl(url, document);
}
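A hedged example of calling the method above: HtmlUnit renders the JavaScript-driven page and the returned Jsoup Document can then be queried with CSS selectors (the URL and selector are illustrative):
// Wait 2 s for background AJAX, with JavaScript enabled
Document doc = getDynamicCrawlersDocument("https://example.com/list", 2000, true);
if (doc != null) {
    // href values are already absolute thanks to initUrl
    doc.select("div.article h2 a").forEach(a -> System.out.println(a.text() + " -> " + a.attr("href")));
}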
Feature | Jsoup | HtmlUnit |
---|---|---|
Parsing speed | ⚡️ millisecond-level responses | ⏳ must load full page resources |
JS support | ❌ does not execute scripts | ✅ full JavaScript engine |
Memory footprint | on the order of 10 MB | 100 MB+ |
Learning curve | core API learned in half a day | requires understanding the browser event model |
Anti-bot evasion | ❌ basic header support only | ✅ simulates a real browser fingerprint |
Choose Jsoup when:
The target data is already present in the initial HTML (static pages)
High-frequency crawling is required (>1000 requests/minute)
Server resources are constrained (cloud functions / edge computing)
Rapid prototyping is needed
Choose HtmlUnit when:
The page relies on AJAX to load data dynamically (JS-driven requests)
A logged-in session must be kept via cookies
Form interaction is involved
Shadow DOM content needs to be parsed
Jsoup and HtmlUnit represent two dimensions of Java crawling: raw efficiency versus full browser emulation. Understanding the design philosophy of each, and choosing flexibly for the scenario at hand (or even combining them, for example rendering the initial page with HtmlUnit and then parsing it with Jsoup), usually gives the best results. As anti-crawling mechanisms grow more sophisticated, choosing the right tool becomes key to successful data extraction.
Complete utility class
package com.zzkj.zei.utils;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import com.zzkj.zei.pojo.system.SysSite;
import com.zzkj.zei.utils.spider.SpiderUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.ObjectUtils;
import org.htmlunit.BrowserVersion;
import org.htmlunit.FailingHttpStatusCodeException;
import org.htmlunit.ScriptException;
import org.htmlunit.WebClient;
import org.htmlunit.html.HtmlAnchor;
import org.htmlunit.html.HtmlPage;
import org.htmlunit.javascript.DefaultJavaScriptErrorListener;
import org.jetbrains.annotations.NotNull;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.net.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* FileName: JsoupHtmlUintUtils
* Author: wzk
* Date:2024/11/8 9:32
*/
@Slf4j
public class JsoupHtmlUintUtils {
    /**
     * Dynamic crawl with default settings: Chrome emulation, JavaScript disabled, 1 s wait.
     *
     * @param url target URL
     * @return parsed Document with links rewritten to absolute URLs, or null on failure
     */
    public static Document getDynamicCrawlersDocument(String url) {
        Document document = null;
        // try-with-resources so the simulated browser is always closed
        try (WebClient browser = new WebClient(BrowserVersion.CHROME)) {
            // Disable CSS and tolerate script errors so dynamic pages can still be captured
            browser.getOptions().setCssEnabled(false);
            browser.getOptions().setJavaScriptEnabled(false);
            browser.getOptions().setThrowExceptionOnScriptError(false);
            // Allow insecure SSL certificates
            browser.getOptions().setUseInsecureSSL(true);
            // Custom listener that silently ignores JavaScript errors
            browser.setJavaScriptErrorListener(new MyJSErrorListener());
            HtmlPage page = browser.getPage(url);
            // Give background scripts time to finish
            browser.waitForBackgroundJavaScript(1000);
            String pageAsXml = page.asXml();
            document = Jsoup.parse(pageAsXml);
        } catch (ScriptException e) {
            log.info("Page: {} JavaScript error: {}", url, e.getMessage());
        } catch (FailingHttpStatusCodeException e) {
            log.info("Page: {} HTTP status error: {}", url, e.getStatusCode());
        } catch (UnknownHostException e) {
            log.info("Page: {} unknown host: {}", url, e.getMessage());
        } catch (Exception e) {
            log.error("Page: {} failed to load page: {}", url, e.getMessage());
        }
        return initUrl(url, document);
    }
    /**
     * Dynamic crawl: render the page with HtmlUnit, then hand the HTML to Jsoup.
     *
     * @param url               target URL
     * @param waitTime          time in ms to wait for background JavaScript
     * @param javaScriptEnabled whether to execute JavaScript on the page
     * @return parsed Document with links rewritten to absolute URLs, or null on failure
     */
    public static Document getDynamicCrawlersDocument(String url, Integer waitTime, boolean javaScriptEnabled) {
        Document document = null;
        try (WebClient browser = new WebClient()) {
            // Disable CSS and tolerate script errors so dynamic pages can still be captured
            browser.getOptions().setCssEnabled(false);
            browser.getOptions().setJavaScriptEnabled(javaScriptEnabled);
            browser.getOptions().setThrowExceptionOnScriptError(false);
            browser.getOptions().setUseInsecureSSL(true);
            // Custom listener that silently ignores JavaScript errors
            browser.setJavaScriptErrorListener(new MyJSErrorListener());
            HtmlPage page = browser.getPage(url);
            // Give background scripts time to finish
            browser.waitForBackgroundJavaScript(waitTime);
            String pageAsXml = page.asXml();
            // Strip the XML declaration before handing the markup to Jsoup
            document = Jsoup.parse(pageAsXml.replaceAll("\\<\\?xml.*?\\?>", ""));
            document.setBaseUri(url);
        } catch (ScriptException e) {
            log.error("getDynamicCrawlersDocument page: {} JavaScript error: {}", url, e.getMessage());
        } catch (UnknownHostException e) {
            log.error("getDynamicCrawlersDocument page: {} unknown host: {}", url, e.getMessage());
        } catch (FailingHttpStatusCodeException e) {
            log.error("getDynamicCrawlersDocument page: {} HTTP status error: {}", url, e.getStatusCode());
        } catch (Exception e) {
            log.error("getDynamicCrawlersDocument page: {} failed to load page: {}", url, e.getMessage());
        }
        return initUrl(url, document);
    }
    private static List<Document> getDynamicCrawlersDocument(String url, Integer waitTime) {
        List<Document> documents = new ArrayList<>();
        HtmlPage oldPage = null;
        try (WebClient browser = new WebClient()) {
            // Disable CSS and tolerate script errors so dynamic pages can still be captured
            browser.getOptions().setCssEnabled(false);
            browser.getOptions().setJavaScriptEnabled(true);
            browser.getOptions().setThrowExceptionOnScriptError(false);
            browser.getOptions().setUseInsecureSSL(true);
            // Custom listener that silently ignores JavaScript errors
            browser.setJavaScriptErrorListener(new MyJSErrorListener());
            HtmlPage page = browser.getPage(url);
            oldPage = page;
            // Give background scripts time to finish
            browser.waitForBackgroundJavaScript(waitTime);
            Document document = getDocuments(url, page);
            documents.add(document);
            // Follow pagination by clicking the "下一页" (next page) link until it disappears or is disabled
            while (true) {
                HtmlAnchor nextButton = page.getFirstByXPath("//a[contains(text(), '下一页')]");
                if (nextButton == null || nextButton.getAttribute("class").contains("disabled")) {
                    break; // no more pages
                }
                page = nextButton.click();
                browser.waitForBackgroundJavaScript(waitTime);
                // Stop if the click did not actually move to a new page
                if (page.equals(oldPage) && !page.getUrl().toString().equals(url)) {
                    break;
                }
                oldPage = page;
                document = getDocuments(url, page);
                documents.add(document);
            }
        } catch (ScriptException e) {
            log.error("getDynamicCrawlersDocument page: {} JavaScript error: {}", url, e.getMessage());
        } catch (UnknownHostException e) {
            log.error("getDynamicCrawlersDocument page: {} unknown host: {}", url, e.getMessage());
        } catch (FailingHttpStatusCodeException e) {
            log.error("getDynamicCrawlersDocument page: {} HTTP status error: {}", url, e.getStatusCode());
        } catch (Exception e) {
            log.error("getDynamicCrawlersDocument page: {} failed to load page: {}", url, e.getMessage());
        }
        return documents;
    }
    private static @NotNull Document getDocuments(String url, HtmlPage page) {
        // Convert the rendered HtmlUnit page into a Jsoup Document, dropping the XML declaration
        String pageAsXml = page.asXml();
        Document document = Jsoup.parse(pageAsXml.replaceAll("\\<\\?xml.*?\\?>", ""));
        document.setBaseUri(url);
        return initUrl(url, document);
    }
    public static List<Document> getDocuments(String url, Integer isDynamic) {
        List<Document> list;
        if (isDynamic == 1) {
            // Dynamic site: render with HtmlUnit and follow pagination
            list = getDynamicCrawlersDocument(url, 1000);
        } else {
            // Static site: plain Jsoup fetch
            list = getJsoupDoc(url);
        }
        return list;
    }
    public static Document getDocument(String url, Integer isDynamic) {
        Document document;
        if (isDynamic == 1) {
            // Dynamic site: render with HtmlUnit (JavaScript enabled, 1 s wait)
            document = getDynamicCrawlersDocument(url, 1000, true);
        } else {
            // Static site: plain Jsoup fetch with up to 3 attempts
            document = getJsoupDoc(url, 1, null);
        }
        return initUrl(url, document);
    }
    /**
     * Static fetch with Jsoup, retrying up to 3 attempts in total.
     *
     * @param url            target URL
     * @param frequency      current attempt number (callers normally pass 1)
     * @param connectTimeout connection timeout in ms; null uses the Jsoup default
     * @return parsed Document with links rewritten to absolute URLs, or null on failure
     */
    public static Document getJsoupDoc(String url, Integer frequency, Integer connectTimeout) {
        Document document = null;
        try {
            if (connectTimeout == null) {
                document = Jsoup.connect(url).ignoreContentType(true).get();
            } else {
                document = Jsoup.connect(url).ignoreContentType(true).maxBodySize(0).timeout(connectTimeout).get();
            }
        } catch (Exception e) {
            // Swallow the error here; the retry logic below decides whether to try again
            log.warn("getJsoupDoc failed for {}: {}", url, e.getMessage());
            document = null;
        }
        // Retry up to 3 attempts in total, with a short back-off between attempts
        if (document == null && frequency < 3) {
            frequency = frequency + 1;
            try {
                Thread.sleep(100);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                log.error("Sleep interrupted: " + e.getMessage(), e);
            }
            document = getJsoupDoc(url, frequency, connectTimeout);
        }
        return initUrl(url, document);
    }
    private static List<Document> getJsoupDoc(String url) {
        List<Document> list = new ArrayList<>();
        Document document = getJsoupDoc(url, 1, null);
        list.add(document);
        return list;
    }
    public static String getRedirectUrl(String url) {
        log.info("getRedirectUrl-------------------url---------------" + url);
        String redirectUrl = "";
        // Simulated browser
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            // Execute the page's JavaScript and follow redirects
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.getOptions().setRedirectEnabled(true);
            // Do not throw when a script fails or the status code indicates an error
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            // Connection timeout in ms
            webClient.getOptions().setTimeout(200);
            // Let HtmlUnit load the page and read the final URL
            redirectUrl = webClient.getPage(url).getUrl().toString();
        } catch (FailingHttpStatusCodeException | IOException e) {
            log.error(url + " failed to resolve redirect target (HTTP/IO): " + e.getMessage(), e);
        } catch (Exception e) {
            log.error(url + " failed to resolve redirect target: " + e.getMessage(), e);
        }
        return redirectUrl;
    }
    /**
     * Resolve the URL taken from a meta refresh tag against the current page.
     *
     * @param hrefUrl     current page URL
     * @param metaTagsUrl URL extracted from the meta tag
     * @param sysSite     site entity
     * @return the resolved redirect URL, or "" if it cannot be resolved
     */
    public static String getRedirectUrl(String hrefUrl, String metaTagsUrl, SysSite sysSite) {
        String redirectUrl = "";
        try {
            if (metaTagsUrl.startsWith("./") && SpiderUtils.isNode(hrefUrl, sysSite)) {
                // Relative to a node (directory-style) page
                if (hrefUrl.endsWith("/")) {
                    redirectUrl = hrefUrl + metaTagsUrl.substring(2);
                } else {
                    redirectUrl = hrefUrl + metaTagsUrl.substring(1);
                }
            } else if (metaTagsUrl.startsWith("./") && hrefUrl.endsWith(".html")) {
                // Relative to the directory containing the current .html page
                hrefUrl = hrefUrl.substring(0, hrefUrl.lastIndexOf("/"));
                metaTagsUrl = metaTagsUrl.substring(1);
                redirectUrl = hrefUrl + metaTagsUrl;
            } else if ("../".equals(metaTagsUrl) && SpiderUtils.isNode(hrefUrl, sysSite)) {
                // Go up one directory level
                if (hrefUrl.endsWith("/")) {
                    hrefUrl = hrefUrl.substring(0, hrefUrl.length() - 1);
                }
                redirectUrl = hrefUrl.substring(0, hrefUrl.lastIndexOf('/'));
            } else if ("/".equals(metaTagsUrl)) {
                // Refresh to the site root
                redirectUrl = sysSite.getSiteDomain();
            } else {
                // Fall back to letting HtmlUnit follow the redirect
                redirectUrl = JsoupHtmlUintUtils.getRedirectUrl(hrefUrl);
            }
        } catch (Exception e) {
            log.error("Failed to resolve redirect URL: " + e.getMessage(), e);
        }
        return redirectUrl;
    }
    /**
     * Extract the target URL from markup containing a meta refresh declaration,
     * e.g. {@code http-equiv="Refresh" content="0; url=..."}.
     *
     * @param refreshMeta element whose HTML contains the meta refresh declaration (may be null)
     * @return the refresh target URL, or "" if none is found
     */
    public static String getMetaTagsUrl(Element refreshMeta) {
        String refreshUrl = "";
        try {
            if (refreshMeta != null) {
                String patternString = "http-equiv\\s*=\\s*\"?Refresh\"?\\s*[\\s;]*content\\s*=\\s*\"?(\\d+);\\s*url\\s*=\\s*(\"?)(.*?)\\2\"";
                Pattern pattern = Pattern.compile(patternString, Pattern.CASE_INSENSITIVE);
                Matcher matcher = pattern.matcher(refreshMeta.html());
                if (matcher.find()) {
                    // Group 3 captures the url= value
                    refreshUrl = matcher.group(3);
                }
            }
        } catch (Exception e) {
            log.error("Failed to extract URL from meta tag: " + e.getMessage(), e);
        }
        return refreshUrl;
    }
    /**
     * Get the HTTP status code of a URL via a HEAD request, retrying up to 3 attempts for 4xx/5xx responses.
     *
     * @param url       target URL
     * @param frequency current attempt number (callers normally pass 1)
     * @return the HTTP status code, or 500 if the request itself failed
     */
    public static Integer getUrlResponseCode(String url, Integer frequency) {
        int statusCode;
        try (HttpResponse response = HttpRequest.head(url).setConnectionTimeout(1000).execute()) {
            // Hutool HEAD request; only the status code is needed
            statusCode = response.getStatus();
            if (statusCode >= 400 && frequency < 3) {
                frequency = frequency + 1;
                try {
                    Thread.sleep(200);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    log.error("Sleep interrupted: " + e.getMessage(), e);
                }
                statusCode = getUrlResponseCode(url, frequency);
            }
        } catch (Exception e) {
            log.error(url + " ----- failed to get status code: " + e.getMessage(), e);
            statusCode = 500;
        }
        return statusCode;
    }
    /**
     * Static fetch with Jsoup, retrying once after 2 s when the server answers with a 5xx status.
     *
     * @param url target URL
     * @return parsed Document with links rewritten to absolute URLs, or null on failure
     */
    private Document getStaticCrawlers(String url) {
        Document document = null;
        try {
            document = Jsoup.connect(url).timeout(5000).get();
        } catch (HttpStatusException e) {
            // Server-side (5xx) error: wait 2 s and retry once
            if ((e.getStatusCode() + "").startsWith("5")) {
                try {
                    Thread.sleep(2000);
                    document = Jsoup.connect(url).timeout(5000).get();
                } catch (IOException ex) {
                    log.error("Retry after 5xx failed for {}: {}", url, ex.getMessage());
                } catch (InterruptedException ex) {
                    Thread.currentThread().interrupt();
                    throw new RuntimeException(ex);
                }
            }
        } catch (Exception e) {
            log.error("getStaticCrawlers failed for {}: {}", url, e.getMessage(), e);
        }
        return initUrl(url, document);
    }
    private Document getStaticCrawlers(String url, Integer waitTime) {
        Document document = null;
        try {
            document = Jsoup.connect(url).timeout(waitTime).get();
        } catch (HttpStatusException e) {
            // Server-side (5xx) error: wait 2 s and retry once
            if ((e.getStatusCode() + "").startsWith("5")) {
                try {
                    Thread.sleep(2000);
                    document = Jsoup.connect(url).timeout(waitTime).get();
                } catch (IOException ex) {
                    log.error("Retry after 5xx failed for {}: {}", url, ex.getMessage());
                } catch (InterruptedException ex) {
                    Thread.currentThread().interrupt();
                    throw new RuntimeException(ex);
                }
            }
        } catch (Exception e) {
            log.error("getStaticCrawlers failed for {}: {}", url, e.getMessage(), e);
        }
        return initUrl(url, document);
    }
    /**
     * Rewrite the relative links in a Document to absolute URLs.
     *
     * @param sourceUrl base URL used to resolve relative paths
     * @param document  Document parsed by Jsoup (may be null)
     * @return the same Document with absolute href values, or null if the input was null
     */
    public static Document initUrl(String sourceUrl, Document document) {
        try {
            if (ObjectUtils.isNotEmpty(document)) {
                URI baseUri;
                try {
                    baseUri = new URI(sourceUrl);
                } catch (URISyntaxException e) {
                    throw new IllegalArgumentException("Invalid base URL: " + sourceUrl, e);
                }
                Elements aList = document.select("a");
                for (Element element : aList) {
                    String href = element.attr("href");
                    // Skip empty href attributes
                    if (href == null || href.isEmpty()) {
                        continue;
                    }
                    // Skip pseudo links such as javascript:void(0)
                    if (SpiderUtils.filterJavaScript(href)) {
                        continue;
                    }
                    // Skip values that do not follow URL rules
                    if (SpiderUtils.illegalUrl(href)) {
                        continue;
                    }
                    try {
                        URI resolvedUri = baseUri.resolve(href);
                        element.attr("href", resolvedUri.toString());
                    } catch (IllegalArgumentException e) {
                        log.error("Could not resolve link '" + href + "': " + e.getMessage());
                    }
                }
            }
        } catch (Exception e) {
            log.error("Failed to normalize links in document: {}", e.getMessage(), e);
        }
        return document;
    }
    /**
     * JavaScript error listener that swallows all script errors so noisy pages do not flood the log.
     */
    static class MyJSErrorListener extends DefaultJavaScriptErrorListener {
        @Override
        public void scriptException(HtmlPage page, ScriptException scriptException) {
            // intentionally ignored
        }
        @Override
        public void timeoutError(HtmlPage page, long allowedTime, long executionTime) {
            // intentionally ignored
        }
        @Override
        public void malformedScriptURL(HtmlPage page, String url, MalformedURLException malformedURLException) {
            // intentionally ignored
        }
        @Override
        public void loadScriptError(HtmlPage page, URL scriptUrl, Exception exception) {
            // intentionally ignored
        }
        @Override
        public void warn(String message, String sourceName, int line, String lineSource, int lineOffset) {
            // intentionally ignored
        }
    }
}
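As a rough usage sketch of the utility class above; the URL, the isDynamic flag and the selector are illustrative assumptions rather than project values:
// Check reachability first, then pick the dynamic (HtmlUnit) or static (Jsoup) path
int status = JsoupHtmlUintUtils.getUrlResponseCode("https://example.com/list", 1);
if (status < 400) {
    // isDynamic == 1 renders the page with HtmlUnit; any other value falls back to plain Jsoup
    Document doc = JsoupHtmlUintUtils.getDocument("https://example.com/list", 1);
    if (doc != null) {
        doc.select("a[href]").forEach(a -> System.out.println(a.attr("href")));
    }
}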