首先判断协议是否为http
// This fetcher only speaks plain HTTP; reject every other scheme up front.
if (!"http".equals(url.getProtocol())) {
  throw new HttpException("Not an HTTP url:" + url);
}
然后根据url获取主机名和端口号。如果url中没有指定端口(getPort()返回-1),则使用默认端口80,此时请求地址中不包含端口号(portString = "");否则使用url中的端口号,并令portString为":" + 端口号。
String host = url.getHost();

// URL.getPort() yields -1 when the URL carries no explicit port. In that case
// fall back to the HTTP default (80) and keep the port out of the request text.
int port = url.getPort();
String portString;
if (port != -1) {
  portString = ":" + port;
} else {
  port = 80;
  portString = "";
}
然后创建socket,并设置读取超时时间(SO_TIMEOUT);如果配置了代理,则实际要连接的是代理的主机和端口:
// create the socket; SO_TIMEOUT bounds every blocking read on it
socket = new Socket();
socket.setSoTimeout(http.getTimeout());

// when a proxy is configured, the TCP connection goes to the proxy
// rather than to the host named in the URL
String sockHost = http.useProxy() ? http.getProxyHost() : host;
int sockPort = http.useProxy() ? http.getProxyPort() : port;
创建InetSocketAddress,并且开始建立连接:
// Build the target address (proxy or origin, as chosen above) and connect,
// passing http.getTimeout() as the connect timeout.
InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort); socket.connect(sockAddr, http.getTimeout());
获取输出流,用于向服务器发送请求:
// make request OutputStream req = socket.getOutputStream();
以下代码用来向服务器发送GET请求:
// Request line: through a proxy the absolute URL is sent, otherwise only the path.
StringBuffer reqStr = new StringBuffer("GET ");
if (http.useProxy()) {
  reqStr.append(url.getProtocol() + "://" + host + portString + path);
} else {
  reqStr.append(path);
}
reqStr.append(" HTTP/1.0\r\n");

// Host header; portString is empty when the default port is used.
reqStr.append("Host: ");
reqStr.append(host);
reqStr.append(portString);
reqStr.append("\r\n");

reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");

// The User-Agent header is only sent when one is configured; a missing
// agent is logged at fatal level but does not abort the request.
String userAgent = http.getUserAgent();
if ((userAgent == null) || (userAgent.length() == 0)) {
  if (Http.LOG.isFatalEnabled()) {
    Http.LOG.fatal("User-agent is not set!");
  }
} else {
  reqStr.append("User-Agent: ");
  reqStr.append(userAgent);
  reqStr.append("\r\n");
}

// blank line terminates the header section
reqStr.append("\r\n");

// Encode with an explicit charset so the bytes on the wire do not depend on
// the platform default encoding of the machine running the fetcher.
byte[] reqBytes = reqStr.toString().getBytes("UTF-8");

req.write(reqBytes);
req.flush();
接着来处理响应,获得输入流并且包装成PushbackInputStream来方便操作:
PushbackInputStream in = // process response new PushbackInputStream( new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE), Http.BUFFER_SIZE) ;
提取状态码和响应中的HTTP header(如果状态码是100 Continue,则继续读取下一个状态行和header):
// Keep reading status line + headers until a status other than 100 arrives.
boolean haveSeenNonContinueStatus = false;
while (!haveSeenNonContinueStatus) {
  // parse status code line
  this.code = parseStatusLine(in, line);
  // parse headers
  parseHeaders(in, line);
  haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
}
接着读取内容:
// Read the response body from the pushback stream.
readPlainContent(in);
获取内容的格式,如果是压缩的则处理压缩
// Content-coding names are case-insensitive in HTTP, so "GZIP" or "X-Gzip"
// must be recognised as well; a null header simply fails both comparisons.
String contentEncoding = getHeader(Response.CONTENT_ENCODING);
if ("gzip".equalsIgnoreCase(contentEncoding)
    || "x-gzip".equalsIgnoreCase(contentEncoding)) {
  // replace the compressed bytes with what processGzipEncoded returns
  content = http.processGzipEncoded(content, url);
} else {
  if (Http.LOG.isTraceEnabled()) {
    Http.LOG.trace("fetched " + content.length + " bytes from " + url);
  }
}
响应的状态行的一般格式(以响应OK为例)为 "HTTP/1.1 200" 或 "HTTP/1.1 200 OK"。先找到状态码前后两个空格的位置:
// codeStart: index of the first space (just before the status code);
// codeEnd: index of the next space after it, or -1 when there is none.
int codeStart = line.indexOf(" "); int codeEnd = line.indexOf(" ", codeStart+1);
如果是第一种情况:
// No second space: the status code runs to the end of the line.
if (codeEnd == -1) {
  codeEnd = line.length();
}

// Convert the text between the two offsets into the numeric status code;
// anything non-numeric is reported as a malformed status line.
int code;
try {
  code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
} catch (NumberFormatException e) {
  throw new HttpException("bad status line '" + line
                          + "': " + e.getMessage(), e);
}
下面看看
private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException:
如果没有空行,那紧接着就是正文了,正文一般会以<!DOCTYPE、<HTML、<html开头。如果读到的一行中包含这个,那么header部分就读完了。
// handle HTTP responses with missing blank line after headers int pos; if ( ((pos= line.indexOf("<!DOCTYPE")) != -1) || ((pos= line.indexOf("<HTML")) != -1) || ((pos= line.indexOf("<html")) != -1) )
接着把多读的那部分压回流中,并设置那一行的长度为pos
// Return the part of the line that already belongs to the body to the stream.
// Assuming `line` was filled by the line reader's `line.append((char)c)`,
// each char holds exactly one raw byte (0-255). ISO-8859-1 maps such chars
// back to the same single byte, whereas UTF-8 would expand every byte
// >= 0x80 into two bytes and corrupt the pushed-back content.
in.unread(line.substring(pos).getBytes("ISO-8859-1"));
// keep only the header part in the line buffer
line.setLength(pos);
接着把对一行的处理委托给processHeaderLine(line)来处理:
try {
  //TODO: (CM) We don't know the header names here
  //since we're just handling them generically. It would
  //be nice to provide some sort of mapping function here
  //for the returned header names to the standard metadata
  //names in the ParseData class
  processHeaderLine(line);
} catch (Exception e) {
  // fixme: best effort only -- a malformed last header line is logged
  // and the header section is still treated as finished
  e.printStackTrace(LogUtil.getErrorStream(Http.LOG));
}
// the body has started, so header parsing ends here
return;
}

// ordinary header line
processHeaderLine(line);
这样我们就比较容易理解下面的代码了:
// Locate the first ':' in the line; -1 means the line has no colon at all.
int colonIndex = line.indexOf(":"); // key is up to colon
如果没有":"并且这行不是空行,则抛出HttpException异常;如果这行只包含空白字符,则直接返回:
if (colonIndex == -1) {
  // Skip over leading whitespace; reaching the end means the line is blank.
  int idx = 0;
  while (idx < line.length() && Character.isWhitespace(line.charAt(idx))) {
    idx++;
  }
  if (idx < line.length()) {
    // something other than whitespace, yet no colon: malformed header
    throw new HttpException("No colon in header:" + line);
  }
  // blank line: nothing to record
  return;
}
最后放到headers中:
String key = line.substring(0, colonIndex); int valueStart = colonIndex+1; // skip whitespace while (valueStart < line.length()) { int c = line.charAt(valueStart); if (c != ' ' && c != '\t') break; valueStart++; } String value = line.substring(valueStart); headers.set(key, value);
如果读到\r并且下一个字符是\n,则把这个\n也读掉;接着(读到\n时同样如此)如果line.length() > 0,也就是这一行已经读到了内容,并且允许续行(allowContinuedLine),就再看下一个字符:如果是' '或者'\t',说明这是续行、此行仍未结束,读掉该字符后继续读取;否则一行结束,返回该行的实际长度。其他情况下直接往line追加所读的字符:
// Start with an empty buffer, then consume bytes until a line terminator.
// Returns the length of the line read; hitting end of stream before a
// terminator raises EOFException.
line.setLength(0);
for (int c = in.read(); c != -1; c = in.read()) {
  switch (c) {
    case '\r':
      // treat "\r\n" as a single terminator by consuming the '\n' too
      if (peek(in) == '\n') {
        in.read();
      }
      // fall through: '\r' ends the line exactly like '\n'
    case '\n':
      if (line.length() > 0) {
        // at EOL -- check for continued line if the current
        // (possibly continued) line wasn't blank
        if (allowContinuedLine)
          switch (peek(in)) {
            case ' ' : case '\t':                   // line is continued
              in.read();
              continue;                             // resumes the for loop
          }
      }
      return line.length();      // else complete
    default :
      // each byte is widened to one char
      line.append((char)c);
  }
}
throw new EOFException();
}
首先从headers(在此之前已经读取了headers并放到metadata中了)中获取响应内容的长度:
int contentLength = Integer.MAX_VALUE; // get content length String contentLengthString = headers.get(Response.CONTENT_LENGTH); if (contentLengthString != null) { contentLengthString = contentLengthString.trim(); try { contentLength = Integer.parseInt(contentLengthString); } catch (NumberFormatException e) { throw new HttpException("bad content length: "+contentLengthString); } }
如果设置了最大内容长度maxContent(>= 0),并且响应长度超过了它,则只读取maxContent那么长的内容:
if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) // limit download size contentLength = http.getMaxContent(); ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE); byte[] bytes = new byte[Http.BUFFER_SIZE]; int length = 0; // read content for (int i = in.read(bytes); i != -1; i = in.read(bytes)) { out.write(bytes, 0, i); length += i; if (length >= contentLength) break; } content = out.toByteArray(); }