把以前在百度空间收集的文章搬到javaeye了,主要用到的lib就是commons-httpclient和htmlparser,在此记录下一些关键的代码片段。
jar包清单
commons-codec-1.3.jar commons-httpclient-3.1.jar commons-lang.jar commons-logging-1.1.jar htmlparser.jar log4j-1.2.15.jar slf4j-api-1.5.8.jar slf4j-log4j12-1.5.8.jar
扩展 org.apache.commons.httpclient.HttpClient,覆盖其executeMethod方法处理cookie
package util;

import java.io.IOException;

import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpState;

/**
 * {@link HttpClient} extension that carries cookie text from one request to the next.
 *
 * <p>Before each request the accumulated cookie text is stored in the client's
 * {@link HttpState} as a single cookie named {@code "cookie"} for the request host.
 * After the request, the value of every {@code Set-Cookie} response header is appended
 * to the accumulated text, separated by {@code ';'}.
 */
public class HttpClientEx extends HttpClient {

    /** HTTP state object, mainly used to hold cookies. */
    private HttpState httpState = new HttpState();

    /** Accumulated {@code Set-Cookie} header values joined with ';' (empty until one is seen). */
    private String cookie = "";

    /**
     * Executes the method with the accumulated cookie text attached, then records any
     * {@code Set-Cookie} headers found in the response.
     *
     * @param httpMethod the HTTP method to execute
     * @return the status code returned by {@code HttpClient.executeMethod}
     * @throws IOException   propagated from the underlying client
     * @throws HttpException propagated from the underlying client
     */
    public int executeMethod(HttpMethod httpMethod) throws IOException, HttpException {
        String carriedValue = this.getCookie();
        String host = httpMethod.getURI().getHost();

        // Install the accumulated text as one cookie on this client's own state object.
        Cookie carried = new Cookie(host, "cookie", carriedValue, "/", null, false);
        httpState.addCookie(carried);
        this.setState(httpState);

        int statusCode = super.executeMethod(httpMethod);

        for (Header header : httpMethod.getResponseHeaders()) {
            if (!header.getName().trim().equalsIgnoreCase("Set-Cookie")) {
                continue;
            }
            // Append with a ';' separator only when something has already been collected.
            String soFar = this.getCookie();
            this.setCookie(soFar.equals("") ? header.getValue() : soFar + ";" + header.getValue());
        }
        return statusCode;
    }

    /** @return the cookie text accumulated so far */
    public String getCookie() {
        return cookie;
    }

    /** @param cookie the cookie text to send with subsequent requests */
    public void setCookie(String cookie) {
        this.cookie = cookie;
    }
}
使用GET方法请求url并读取响应内容
// Fetch the blog list page with a GET request and decode the response as text.
String url = HTTP_HI_BAIDU_COM + USER_ID + "/blog";
HttpClient client = new HttpClientEx();
GetMethod getMethod = new GetMethod(url);
String body;
try {
    client.executeMethod(getMethod);
    // getResponseBody() may return null when no body is available; treat that as empty text
    // instead of failing with a NullPointerException inside the String constructor.
    byte[] rawBody = getMethod.getResponseBody();
    body = (rawBody == null) ? "" : new String(rawBody, getMethod.getResponseCharSet());
} finally {
    // Always release the connection, even if the request or the decoding throws.
    getMethod.releaseConnection();
}
logger.debug("日志列表页面\n{}", body);
分析html页面中的div元素
// Parse the fetched page and keep only the <div> tags.
Parser parser = Parser.createParser(body, getMethod.getResponseCharSet());
NodeList nodeList = parser.parse(new TagNameFilter("div"));

// Log the HTML of every div whose id attribute is "m_blog".
for (int index = 0; index < nodeList.size(); index++) {
    Div div = (Div) nodeList.elementAt(index);
    String divId = div.getAttribute("id");
    if (!"m_blog".equals(divId)) {
        continue;
    }
    logger.debug("id为m_blog的div内容\n{}", div.toHtml());
}
查找含有特定文字的节点集合
// Collect the nodes of this div that contain the text "类别" ("category").
// NOTE(review): search depth and case-sensitivity are decided by htmlparser's searchFor — confirm against its Javadoc.
NodeList searchFor = div.searchFor("类别");
设置User-Agent和post数据字符编码
/** User-Agent string to present to the server (an MSIE 8 identifier). */
private static final String USER_AGENT =
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; iCafeMedia; InfoPath.2)";

/** Character encoding to use for request content such as POST data. */
private static final String CHARSET = "UTF-8";

// Build a fresh host-level parameter set and install it on the client's host configuration.
HostParams hostParams = new HostParams();
hostParams.setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, CHARSET);
hostParams.setParameter(HttpMethodParams.USER_AGENT, USER_AGENT);
client.getHostConfiguration().setParams(hostParams);
使用POST方法向url提交表单数据
// Submit the login form with a POST request.
String url = HOST + "/login";
PostMethod postMethod = new PostMethod(url);
postMethod.setParameter("name", "fangwei");
postMethod.setParameter("password", "******");
try {
    client.executeMethod(postMethod);
    // Read anything needed from the response here, before the connection is released.
} finally {
    // Always release the connection so it is not leaked, even when the request throws.
    postMethod.releaseConnection();
}