/****************************************************** * Web crawler * * * Copyright (C) 2012 by Peter Hedenskog (http://peterhedenskog.com) * ****************************************************** * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express * or implied. See the License for the specific language governing permissions and limitations under * the License. * ******************************************************* */ package com.soulgalore.crawler.core; import java.util.Map; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; /** * The response for a html page. * */ public class HTMLPageResponse { private static final int NO_HTTP_PORT = -1; private final Document doc; private final String encoding; private final CrawlerURL url; private final int responseCode; private final String responseType; private final Map<String, String> headers; private final long fetchTime; /** * Create a response. * * @param pageUrl the url * @param theResponseCode the response code * @param theHeaders the headers * @param theBody the body * @param theEncoding the encoding * @param theSize the size * @param theResponseType the response mime type * @param theFetchTime the time it took to fetch the response */ public HTMLPageResponse(CrawlerURL pageUrl, int theResponseCode, Map<String, String> theHeaders, String theBody, String theEncoding, long theSize, String theResponseType, long theFetchTime) { encoding = theEncoding; url = pageUrl; responseCode = theResponseCode; responseType = theResponseType; headers = theHeaders; fetchTime = theFetchTime; // special hack: // if the path contains a . (.html etc) then use the full path, // // relative links using ../ get's confused if the path don't // ends with an / if (!pageUrl.isWrongSyntax()) { final String baseUri = pageUrl.getUri().getScheme() + "://" + pageUrl.getUri().getHost() + ((pageUrl.getUri().getPort() != NO_HTTP_PORT) ? ":" + pageUrl.getUri().getPort() : "") + ((pageUrl.getUri().getPath().contains(".")) ? pageUrl.getUri().getPath() : pageUrl .getUri().getPath() + (pageUrl.getUri().getPath().endsWith("/") ? "" : "/")); // OK, here's a true story, There are some plugins (WP?) that create a href tags with a return instead // of a space between the a and the href. Lets catch them in this ugly way if (theBody.contains("<ahref")) { theBody = theBody.replaceAll("<ahref", "<a href"); } doc = Jsoup.parse(theBody, baseUri); } else { doc = null; } } public String getEncoding() { return encoding; } public Document getBody() { return doc; } public String getUrl() { return url.getUrl(); } public String getResponseType() { return responseType; } public CrawlerURL getPageUrl() { return url; } public int getResponseCode() { return responseCode; } public Map<String, String> getResponseHeaders() { return headers; } public String getHeaderValue(String key) { return headers.get(key); } public long getFetchTime() { return fetchTime; } @Override public String toString() { // left out the body & headers for now return this.getClass().getSimpleName() + "url:" + getUrl() + "responseCode:" + getResponseCode() + "encoding:" + encoding + " type:" + responseType; } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + responseCode; result = prime * result + ((responseType == null) ? 0 : responseType.hashCode()); result = prime * result + ((url == null) ? 0 : url.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; HTMLPageResponse other = (HTMLPageResponse) obj; if (responseCode != other.responseCode) return false; if (responseType == null) { if (other.responseType != null) return false; } else if (!responseType.equals(other.responseType)) return false; if (url == null) { if (other.url != null) return false; } else if (!url.equals(other.url)) return false; return true; } }