httpclient+nekohtml 解析HTML

httpclient+nekohtml 解析HTML

httpclient+nekohtml 解析HTML

import org.cyberneko.html.parsers.DOMParser;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.w3c.dom.Document;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

import java.io.InputStream;
import java.io.IOException;

public class Html2XML {
    private int connectionTimeout = 5000;
    private int soTimeout = 12000;
    private String proxyHost = null;
    private int proxyPort;

    public Document getDocument(String url) {
        HttpClient client = new HttpClient();
        if (proxyHost != null) {
            client.getHostConfiguration().setProxy(proxyHost, proxyPort);
        }
        client.getHttpConnectionManager().getParams().setConnectionTimeout(connectionTimeout);
        client.getHttpConnectionManager().getParams().setSoTimeout(soTimeout);
        GetMethod method = new GetMethod(url);
        method.addRequestHeader("Content-Type", "text/html; charset=utf-8");
        try {
            int statusCode = client.executeMethod(method);
            if (statusCode != HttpStatus.SC_OK) {
                throw new HttpException("HttpStatusCode : " + statusCode);
            }
            InputStream is = method.getResponseBodyAsStream();
            DOMParser parser = new DOMParser();
            parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "utf-8");
            parser.parse(new InputSource(is));
            return (parser.getDocument());
        } catch (HttpException he) {
            he.printStackTrace();
        } catch (IOException ie) {
            ie.printStackTrace();
        } catch (SAXException se) {
            se.printStackTrace();
        }
        return null;
    }
}

你可能感兴趣的:(httpclient+nekohtml 解析HTML)