import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.AndFilter; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.tags.LinkTag; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; public class BaiduParse { public static void main(String[] args) throws Exception { String str = "http://tieba.baidu.com/p/1303669256"; String endPage = getEndPage(str); String url = null; for (int i = 1; !endPage.equals(url); i++) { url = str + "?pn=" + i; System.out .println("================================================" + url + "================================================"); String content = getContent(url); writeStringToFile(fileName(url) + ".html", content, "gb2312"); } } /** * 保存到本地的文件名称 * * @param name * 文件名称 * @return */ public static String fileName(String name) { String abc = name.split("/")[name.split("/").length - 1]; String aaa = abc.split("\\?")[0] + abc.split("\\?")[abc.split("\\?").length - 1]; return aaa; } /** * 找到最后一页的URL * * @param url * @return * @throws ParserException */ public static String getEndPage(String url) throws ParserException { String str = null; Parser parser = new Parser(); parser.setURL(url); parser.setEncoding("gb2312"); // 设置过滤器,只获取li标签,并且只有class属性为l_pager pager_theme_2的html节点(包括子节点) NodeFilter beginNodeFilter = new AndFilter(new TagNameFilter("li"), new HasAttributeFilter("class", "l_pager pager_theme_2")); NodeList nodeList = parser.extractAllNodesThatMatch(beginNodeFilter); if (nodeList != null && nodeList.size() > 0) { Node nameNode = nodeList.elementAt(nodeList.size() - 1); //得到最后一个链接 LinkTag n = (LinkTag) nameNode.getLastChild(); str = n.extractLink(); } parser.reset(); return str; } /** * 获取某个URL中的内容,这里只留下了br标签 * * @param url * @return * @throws ParserException */ public static String getContent(String url) throws ParserException { Parser parser = new Parser(); parser.setURL(url); parser.setEncoding("gb2312"); // 设置过滤器,只获取p标签,并且只有class属性为d_post_content的html节点(包括子节点) NodeFilter beginNodeFilter = new AndFilter(new TagNameFilter("p"), new HasAttributeFilter("class", "d_post_content")); // 执行解析得到所有节点集合 NodeList nodeList = parser.extractAllNodesThatMatch(beginNodeFilter); StringBuffer sb = new StringBuffer(); for (int i = 0; i < nodeList.size(); i++) { Node nameNode = nodeList.elementAt(i); NodeList cNodeList = nameNode.getChildren(); // 自定义解析器把包含a标签与包含img标签的节点去掉 NodeFilter nodeFilter = new NodeFilter() { private static final long serialVersionUID = 1L; public boolean accept(Node arg0) { // 如果包含a或者包含img跳过 if (arg0.toHtml().startsWith("<a") || arg0.toHtml().startsWith("<img")) { return false; } return true; } }; //对子节点进行a标签img标签过滤 cNodeList = cNodeList.extractAllNodesThatMatch(nodeFilter); for (int j = 0; j < cNodeList.size(); j++) { Node cnameNode = cNodeList.elementAt(j); sb.append(cnameNode.toHtml()); } } parser.reset(); return sb.toString(); } /** * 把字符串写入文件中 * * @param fileName * 文件名称 * @param content * 文件内容 * @param enc * 字符集编码 * @return * @throws IOException */ public static boolean writeStringToFile(String fileName, String content, String enc) throws IOException { File file = new File(fileName); try { if (file.isFile()) { file.deleteOnExit(); file = new File(file.getAbsolutePath()); } OutputStreamWriter os = null; if (enc == null || enc.length() == 0) { os = new OutputStreamWriter(new FileOutputStream(file)); } else { os = new OutputStreamWriter(new FileOutputStream(fileName), enc); } os.write(content); os.close(); } catch (Exception e) { e.printStackTrace(); return false; } return true; } }