/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.httpclient;
// JDK imports
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
// HTTP Client imports
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpVersion;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.httpclient.HttpException;
// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.SpellCheckedMetadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.http.api.HttpBase;
/**
 * An HTTP response.
 *
 * <p>The response is fetched eagerly in the constructor using a Commons
 * HttpClient {@code GetMethod}. The status code, the response headers and the
 * (possibly truncated and decompressed) body are stored and exposed through
 * the {@link Response} accessors.</p>
 *
 * @author Susam Pal
 */
public class HttpResponse implements Response {

  private final URL url;
  /** Response body; stays <code>null</code> when no body could be read. */
  private byte[] content;
  private int code;
  private final Metadata headers = new SpellCheckedMetadata();

  /**
   * Fetches the given {@code url} and prepares HTTP response.
   *
   * @param http                An instance of the implementation class
   *                            of this plugin
   * @param url                 URL to be fetched
   * @param datum               Crawl data
   * @param followRedirects     Whether to follow redirects; follows
   *                            redirect if and only if this is true
   * @throws IOException        When an error occurs
   */
  HttpResponse(Http http, URL url, CrawlDatum datum,
      boolean followRedirects) throws IOException {

    // Prepare GET method for HTTP request
    this.url = url;
    GetMethod get = new GetMethod(url.toString());
    get.setFollowRedirects(followRedirects);
    get.setDoAuthentication(true);
    if (datum.getModifiedTime() > 0) {
      get.setRequestHeader("If-Modified-Since",
          HttpDateFormat.toString(datum.getModifiedTime()));
    }

    // Set HTTP parameters
    HttpMethodParams params = get.getParams();
    if (http.getUseHttp11()) {
      params.setVersion(HttpVersion.HTTP_1_1);
    } else {
      params.setVersion(HttpVersion.HTTP_1_0);
    }
    params.makeLenient();
    params.setContentCharset("UTF-8");
    params.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
    params.setBooleanParameter(HttpMethodParams.SINGLE_COOKIE_HEADER, true);
    // XXX (ab) not sure about this... the default is to retry 3 times; if
    // XXX the request body was sent the method is not retried, so there is
    // XXX little danger in retrying...
    // params.setParameter(HttpMethodParams.RETRY_HANDLER, null);

    try {
      code = Http.getClient().executeMethod(get);

      Header[] heads = get.getResponseHeaders();
      for (int i = 0; i < heads.length; i++) {
        headers.set(heads[i].getName(), heads[i].getValue());
      }

      // Limit download size: the smaller of the declared Content-Length
      // and the configured maximum (a negative maximum means "no limit").
      int contentLength = Integer.MAX_VALUE;
      String contentLengthString = headers.get(Response.CONTENT_LENGTH);
      if (contentLengthString != null) {
        try {
          contentLength = Integer.parseInt(contentLengthString.trim());
        } catch (NumberFormatException ex) {
          throw new HttpException("bad content length: " +
              contentLengthString);
        }
      }
      if (http.getMaxContent() >= 0 &&
          contentLength > http.getMaxContent()) {
        contentLength = http.getMaxContent();
      }

      // always read content. Sometimes content is useful to find a cause
      // for error.
      InputStream in = get.getResponseBodyAsStream();
      try {
        // The stream is null when the response carries no body (for
        // instance a 304 reply to the If-Modified-Since header set above);
        // content then simply stays null.
        if (in != null) {
          content = readLimited(in, contentLength);
        }
      } catch (Exception e) {
        if (code == 200) {
          // Keep the original exception as the cause so the stack trace
          // of the real failure is not lost.
          IOException wrapped = new IOException(e.toString());
          wrapped.initCause(e);
          throw wrapped;
        }
        // for codes other than 200 OK, we are fine with empty content
      } finally {
        if (in != null) {
          in.close();
        }
        get.abort();
      }

      // Query the log level once so that fetchTrace is either built and
      // used consistently, or not touched at all.
      final boolean traceEnabled = Http.LOG.isTraceEnabled();
      StringBuilder fetchTrace = null;
      if (traceEnabled) {
        // Trace message
        fetchTrace = new StringBuilder("url: " + url +
            "; status code: " + code +
            "; bytes received: " + (content == null ? 0 : content.length));
        if (getHeader(Response.CONTENT_LENGTH) != null) {
          fetchTrace.append("; Content-Length: ")
              .append(getHeader(Response.CONTENT_LENGTH));
        }
        if (getHeader(Response.LOCATION) != null) {
          fetchTrace.append("; Location: ")
              .append(getHeader(Response.LOCATION));
        }
      }

      // Extract gzip, x-gzip and deflate content
      if (content != null) {
        // check if we have to uncompress it
        String contentEncoding = headers.get(Response.CONTENT_ENCODING);
        if (contentEncoding != null && traceEnabled) {
          fetchTrace.append("; Content-Encoding: ").append(contentEncoding);
        }
        if ("gzip".equals(contentEncoding) ||
            "x-gzip".equals(contentEncoding)) {
          content = http.processGzipEncoded(content, url);
          if (traceEnabled) {
            fetchTrace.append("; extracted to ").append(content.length)
                .append(" bytes");
          }
        } else if ("deflate".equals(contentEncoding)) {
          content = http.processDeflateEncoded(content, url);
          if (traceEnabled) {
            fetchTrace.append("; extracted to ").append(content.length)
                .append(" bytes");
          }
        }
      }

      // Log trace message
      if (traceEnabled) {
        Http.LOG.trace(fetchTrace);
      }
    } finally {
      get.releaseConnection();
    }
  }

  /**
   * Reads at most <code>limit</code> bytes from <code>in</code>.
   *
   * <p>Each read request is capped at the number of bytes still allowed, so
   * the returned array never exceeds <code>limit</code> bytes. The stream is
   * not closed by this method.</p>
   *
   * @param in     stream to read from; must not be <code>null</code>
   * @param limit  maximum number of bytes to return; zero or a negative
   *               value yields an empty array
   * @return the bytes read, at most <code>limit</code> of them
   * @throws IOException if reading from the stream fails
   */
  private static byte[] readLimited(InputStream in, int limit)
      throws IOException {
    byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    int totalRead = 0;
    while (totalRead < limit) {
      int wanted = Math.min(buffer.length, limit - totalRead);
      int bufferFilled = in.read(buffer, 0, wanted);
      if (bufferFilled == -1) {
        break; // end of stream reached before the limit
      }
      totalRead += bufferFilled;
      out.write(buffer, 0, bufferFilled);
    }
    return out.toByteArray();
  }

  /* ------------------------- *
   * <implementation:Response> *
   * ------------------------- */

  /** Returns the URL that was requested. */
  public URL getUrl() {
    return url;
  }

  /** Returns the HTTP status code of the response. */
  public int getCode() {
    return code;
  }

  /** Returns the value stored for the named response header, if any. */
  public String getHeader(String name) {
    return headers.get(name);
  }

  /** Returns all response headers collected from the server reply. */
  public Metadata getHeaders() {
    return headers;
  }

  /**
   * Returns the response body after size limiting and decompression.
   * May be <code>null</code> when the response had no body stream or when
   * reading the body of a non-200 response failed.
   */
  public byte[] getContent() {
    return content;
  }

  /* -------------------------- *
   * </implementation:Response> *
   * -------------------------- */
}