/**
* villemos solutions [space^] (http://www.villemos.com)
* Probe. Send. Act. Emergent solution.
* Copyright 2011 Gert Villemos
* All Rights Reserved.
*
* Released under the Apache license, version 2.0 (do what ever
* you want, just dont claim ownership).
*
* NOTICE: All information contained herein is, and remains
* the property of villemos solutions, and its suppliers
* if any. The intellectual and technical concepts contained
* herein are proprietary to villemos solutions
* and its suppliers and may be covered by European and Foreign Patents,
* patents in process, and are protected by trade secret or copyright law.
*
* Dissemination of this information or reproduction of this material
* is strictly forbidden unless prior written permission is obtained
* from villemos solutions.
*
* And it wouldn't be nice either.
*
*/
package com.villemos.ispace.httpcrawler;
import java.io.IOException;
import java.io.InputStream;
import java.net.ProxySelector;
import java.net.URI;
import java.security.SecureRandom;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import org.apache.camel.Endpoint;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CookieStore;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.client.protocol.ClientContext;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.scheme.SocketFactory;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.ProxySelectorRoutePlanner;
import org.apache.http.impl.conn.SingleClientConnManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
// import com.villemos.ispace.api.InformationObject;
/**
 * Crawls a HTTP(S) site described by a {@link HttpCrawlerEndpoint}.
 *
 * <p>{@link #poll()} builds the HTTP client (proxy, credentials, cookies),
 * fetches the start page and hands it to {@link #processSite(URI, HttpResponse)},
 * which extracts anchor links with {@link #urlPattern} and follows every link
 * that lies within the endpoint's boundaries. Each retrieved page is passed to
 * the optional {@link HttpCrawlerConsumer}.
 *
 * <p>The class keeps mutable crawl state in plain collections and performs no
 * synchronization of its own.
 */
public class HttpAccessor {

    private static final Log LOG = LogFactory.getLog(HttpAccessor.class);

    /** HTTP status codes handled explicitly by {@link #poll()}. */
    private static final int STATUS_OK = 200;
    private static final int STATUS_FOUND = 302;

    /** The client used for all requests; (re)created by each {@link #poll()}. */
    protected DefaultHttpClient client = null;

    /** Cookie store shared by the client and the local execution context. */
    protected CookieStore cookieStore = new BasicCookieStore();

    /** When true, {@link #poll()} installs an SSL context backed by {@code EasyX509TrustManager}. */
    protected boolean ignoreAuthenticationFailure = true;

    /** Counters available to subclasses; not updated by this class. */
    protected long processed = 0;
    protected long failed = 0;

    /** URLs that have been taken from the work set (whether or not the fetch succeeded). */
    protected List<String> crawledPages = new ArrayList<String>();

    /** URLs discovered but not yet fetched. */
    protected Set<String> uncrawledPages = new HashSet<String>();

    /** URLs skipped because they look like javascript or fall outside the boundaries. */
    protected List<String> ignoredPages = new ArrayList<String>();

    /** URLs whose retrieval threw an exception. */
    protected List<String> failedPages = new ArrayList<String>();

    /** Matches {@code <a href="...">} / {@code <a href='...'>}; group 2 is the link. */
    protected Pattern urlPattern = Pattern.compile("<a href=(\"|\')(.*?)(\'|\")");

    /** Host that requests are executed against; set by {@link #poll()}. */
    protected HttpHost target = null;

    /** Execution context carrying the cookie store; set by {@link #poll()}. */
    protected HttpContext localContext = null;

    protected Endpoint endpoint = null;

    /** Receiver of crawled pages; may be null, in which case pages are not submitted. */
    protected HttpCrawlerConsumer consumer = null;

    /**
     * Creates an accessor without a consumer; crawled pages are not submitted anywhere.
     *
     * @param endpoint the endpoint holding the crawl configuration
     */
    protected HttpAccessor(Endpoint endpoint) {
        this.endpoint = endpoint;
    }

    /**
     * Creates an accessor that submits every crawled page to the given consumer.
     *
     * @param endpoint the endpoint holding the crawl configuration
     * @param consumer the receiver of crawled pages
     */
    protected HttpAccessor(Endpoint endpoint, HttpCrawlerConsumer consumer) {
        this.endpoint = endpoint;
        this.consumer = consumer;
    }

    /**
     * @return the endpoint, cast to {@link HttpCrawlerEndpoint}
     */
    public HttpCrawlerEndpoint getEndpoint() {
        return (HttpCrawlerEndpoint) endpoint;
    }

    /**
     * @return the endpoint, cast to {@link HttpCrawlerEndpoint}
     */
    protected HttpCrawlerEndpoint getHttpCrawlerEndpoint() {
        return ((HttpCrawlerEndpoint) getEndpoint());
    }

    /**
     * Performs one crawl: configures the client, requests the start page and
     * processes the site reachable from it.
     *
     * <p>A 200 response is processed directly. On a 302 response the host recorded
     * in the execution context is requested and that response is processed.
     * Any other status is logged together with the response body.
     *
     * @return always 0
     * @throws Exception if the client cannot be set up or the initial request fails
     */
    public int poll() throws Exception {
        HttpCrawlerEndpoint crawlerEndpoint = getHttpCrawlerEndpoint();

        /* Release the connections of the client left over from a previous poll
         * before replacing it; otherwise every poll leaks a connection manager. */
        if (client != null) {
            client.getConnectionManager().shutdown();
        }
        client = createClient();

        configureProxy(crawlerEndpoint);
        configureCredentials(crawlerEndpoint);

        /* Set default cookie policy and store. Can be overridden for a specific method using for example;
         * method.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY); */
        client.setCookieStore(cookieStore);
        client.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);

        String protocol = crawlerEndpoint.getProtocol();
        String domain = crawlerEndpoint.getDomain();
        int port = crawlerEndpoint.getPort();

        /* NOTE(review): only port 80 is treated as "default"; an https endpoint
         * on 443 therefore gets an explicit ":443". Behaviour kept as is. */
        String uriStr = protocol + "://" + domain;
        if (port != 80) {
            uriStr += ":" + port + "" + crawlerEndpoint.getPath();
            target = new HttpHost(domain, port, protocol);
        }
        else {
            uriStr += crawlerEndpoint.getPath();
            target = new HttpHost(domain);
        }
        URI uri = new URI(uriStr);

        localContext = new BasicHttpContext();
        localContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore);

        /* Default boundary is the domain. Only add it once, so that repeated
         * polls do not grow the boundary collection. */
        String defaultBoundary = protocol + "://" + domain;
        if (!crawlerEndpoint.getBoundaries().contains(defaultBoundary)) {
            crawlerEndpoint.getBoundaries().add(defaultBoundary);
        }

        HttpUriRequest method = createInitialRequest(uri);
        HttpResponse response = client.execute(target, method, localContext);
        int statusCode = response.getStatusLine().getStatusCode();

        if (statusCode == STATUS_OK) {
            processSite(uri, response);
        }
        else if (statusCode == STATUS_FOUND) {
            HttpHost redirectHost = (HttpHost) localContext.getAttribute(ExecutionContext.HTTP_TARGET_HOST);
            HttpGet get = new HttpGet(redirectHost.toURI());

            /* Read the response fully, to clear it, before reusing the connection. */
            readBody(response.getEntity());

            response = client.execute(redirectHost, get, localContext);
            processSite(uri, response);
            LOG.info("Final target: " + redirectHost);
        }
        else {
            LOG.warn("Unexpected HTTP status " + statusCode + " for " + uri + ": " + readBody(response.getEntity()));
        }

        return 0;
    }

    /**
     * Builds the HTTP client. When {@link #ignoreAuthenticationFailure} is set, the
     * https scheme is backed by an SSL context using {@code EasyX509TrustManager}.
     *
     * <p>SECURITY NOTE(review): the original comment describes that trust manager as
     * "trusts everything", i.e. server certificates are presumably not validated.
     * This is the default ({@code ignoreAuthenticationFailure == true}); confirm this
     * is acceptable for the deployment.
     */
    private DefaultHttpClient createClient() throws Exception {
        if (!ignoreAuthenticationFailure) {
            return new DefaultHttpClient();
        }

        SSLContext sslContext = SSLContext.getInstance("SSL");
        sslContext.init(null, new TrustManager[] {new EasyX509TrustManager()}, new SecureRandom());

        SchemeRegistry schemeRegistry = new SchemeRegistry();

        SSLSocketFactory sf = new SSLSocketFactory(sslContext);
        schemeRegistry.register(new Scheme("https", sf, 443));

        SocketFactory sfa = new PlainSocketFactory();
        schemeRegistry.register(new Scheme("http", sfa, 80));

        HttpParams params = new BasicHttpParams();
        ClientConnectionManager cm = new SingleClientConnManager(params, schemeRegistry);
        return new DefaultHttpClient(cm, params);
    }

    /**
     * Routes requests through the endpoint's explicit proxy when both host and port
     * are configured; otherwise falls back to the JVM default {@link ProxySelector}.
     */
    private void configureProxy(HttpCrawlerEndpoint crawlerEndpoint) {
        String proxyHost = crawlerEndpoint.getProxyHost();
        Integer proxyPort = crawlerEndpoint.getProxyPort();

        if (proxyHost != null && proxyPort != null) {
            HttpHost proxy = new HttpHost(proxyHost, proxyPort);
            client.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
        }
        else {
            ProxySelectorRoutePlanner routePlanner = new ProxySelectorRoutePlanner(
                    client.getConnectionManager().getSchemeRegistry(),
                    ProxySelector.getDefault());
            client.setRoutePlanner(routePlanner);
        }
    }

    /**
     * Registers user/password credentials for the endpoint's domain and port, when
     * both user and password are configured.
     */
    private void configureCredentials(HttpCrawlerEndpoint crawlerEndpoint) {
        String user = crawlerEndpoint.getAuthenticationUser();
        String password = crawlerEndpoint.getAuthenticationPassword();

        if (user != null && password != null) {
            client.getCredentialsProvider().setCredentials(
                    new AuthScope(crawlerEndpoint.getDomain(), crawlerEndpoint.getPort()),
                    new UsernamePasswordCredentials(user, password));
        }
    }

    /**
     * Reads the entity's content completely.
     *
     * @param entity the response entity; may be null (a response without a body)
     * @return the content, or the empty string when there is no entity
     */
    private static String readBody(HttpEntity entity) throws IOException {
        if (entity == null) {
            return "";
        }
        return HttpClientConfigurer.readFully(entity.getContent());
    }

    /**
     * Default method for processing a site, with no special processing. URLs are
     * detected in the start page and crawled iteratively until no uncrawled URL remains.
     *
     * Override this method to process a specific site.
     *
     * @param uri the URI of the start page, used as the page's identifier on submission
     * @param response the response holding the start page; a response without an
     *        entity is treated as an empty page
     * @throws IOException if the start page cannot be read
     */
    protected void processSite(URI uri, HttpResponse response) throws IOException {
        /* Read the complete page. */
        String page = readBody(response.getEntity());

        /* Detect URLs. */
        detectUrls(page);

        /* Index this page. */
        submitPage(uri.toString(), page);

        /* Process all URLs. processUrl may add further entries to uncrawledPages,
         * so a fresh iterator is obtained on every round. */
        while (!uncrawledPages.isEmpty()) {
            String newUrl = uncrawledPages.iterator().next();
            LOG.info("Crawling url " + newUrl + ". Processed " + crawledPages.size() + "/" + (uncrawledPages.size() + crawledPages.size()) + ".");
            uncrawledPages.remove(newUrl);

            /* Register this URL as crawled. We do this before we crawl, as no matter whether we succeed or not in
             * the crawl of the page, we should not crawl the page again. */
            crawledPages.add(newUrl);

            processUrl(newUrl);
        }
    }

    /**
     * Method to create the first get call. Per default this simply gets the
     * front page. However the method can be overridden to provide more advanced
     * processing, such as submitting an initial form with tokens.
     *
     * @param uri the URI of the start page
     * @return a GET request for the given URI
     */
    protected HttpUriRequest createInitialRequest(URI uri) {
        return new HttpGet(uri.toString());
    }

    /**
     * Fetches a single URL, detects the links it contains and submits the page.
     *
     * <p>Failures are not propagated: the exception is logged, the URL is recorded
     * in {@link #failedPages} and an empty page is submitted for it. A response
     * without a body is logged and likewise handled as an empty page, but is not
     * counted as a failure.
     *
     * @param url the absolute URL to fetch
     */
    public void processUrl(String url) {
        /* Get the page. */
        String page = "";
        try {
            HttpGet get = new HttpGet(url);
            HttpResponse response = client.execute(target, get, localContext);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                page = HttpClientConfigurer.readFully(entity.getContent());
            }
            else {
                LOG.warn("No content returned for url " + url + ".");
            }
        } catch (Exception e) {
            LOG.error("Failed to retrieve url " + url + ".", e);
            failedPages.add(url);
        }

        /* Detect URLs. */
        detectUrls(page);

        /* Index this page. */
        submitPage(url, page);
    }

    /**
     * Extracts all anchor links from the page and queues those that are neither
     * javascript links, nor outside the crawl boundaries, nor already crawled.
     * Rejected links are recorded in {@link #ignoredPages}.
     *
     * @param page the page content; null is treated as a page without links
     */
    protected void detectUrls(String page) {
        if (page == null) {
            return;
        }

        Matcher matcher = urlPattern.matcher(page);
        while (matcher.find()) {
            String entryUrl = matcher.group(2).trim();

            /* Add the full address if relative address. */
            if (!hasHttpScheme(entryUrl)) {
                entryUrl = toAbsoluteUrl(entryUrl);
            }

            /* Ignore URLs in javascripts. */
            if (entryUrl.contains("javascript")) {
                ignoredPages.add(entryUrl);
                continue;
            }

            /* Check it against the boundaries of the crawl. */
            if (!isWithinBoundaries(entryUrl)) {
                ignoredPages.add(entryUrl);
                continue;
            }

            /* Queue it unless already crawled; the set drops duplicates itself. */
            if (!crawledPages.contains(entryUrl)) {
                uncrawledPages.add(entryUrl);
            }
        }
    }

    /**
     * Tells whether the link starts with {@code http://} or {@code https://}
     * (case-insensitive). A link that merely contains "http" somewhere, such as
     * a query parameter, is a relative link.
     */
    private static boolean hasHttpScheme(String url) {
        return url.regionMatches(true, 0, "http://", 0, 7)
                || url.regionMatches(true, 0, "https://", 0, 8);
    }

    /**
     * Prefixes a relative link with the endpoint's protocol, domain and path.
     *
     * <p>NOTE(review): a "/" is inserted before the endpoint path, while poll()
     * concatenates domain and path directly; if the path already starts with "/"
     * the result contains "//". Behaviour kept as is - confirm the path format.
     */
    private String toAbsoluteUrl(String relativeUrl) {
        HttpCrawlerEndpoint crawlerEndpoint = getHttpCrawlerEndpoint();

        String suffix = relativeUrl.startsWith("/") ? relativeUrl : "/" + relativeUrl;
        String base = crawlerEndpoint.getProtocol() + "://" + crawlerEndpoint.getDomain();

        if (!crawlerEndpoint.getPath().equals("")) {
            return base + "/" + crawlerEndpoint.getPath() + suffix;
        }
        return base + suffix;
    }

    /** Tells whether the URL starts with at least one of the endpoint's boundaries. */
    private boolean isWithinBoundaries(String url) {
        for (String boundary : getHttpCrawlerEndpoint().getBoundaries()) {
            if (url.startsWith(boundary)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Hands a crawled page to the consumer, if one was configured.
     *
     * @param url the URL the page was retrieved from
     * @param page the page content
     */
    protected void submitPage(String url, String page) {
        if (consumer != null) {
            consumer.submitPage(url, page);
        }
    }
}