package com.netifera.platform.net.http.tools; import java.io.IOException; import java.net.URI; import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Queue; import java.util.Set; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.HttpRequest; import org.apache.http.HttpResponse; import org.apache.http.message.BasicHttpRequest; import org.apache.http.nio.protocol.HttpRequestExecutionHandler; import org.apache.http.nio.reactor.SessionRequest; import org.apache.http.nio.reactor.SessionRequestCallback; import org.apache.http.protocol.HttpContext; import org.apache.http.util.EntityUtils; import com.netifera.platform.api.tools.IToolContext; import com.netifera.platform.net.dns.model.EmailAddressEntity; import com.netifera.platform.net.http.internal.tools.Activator; import com.netifera.platform.net.http.service.AsynchronousHTTPClient; import com.netifera.platform.net.http.service.HTTP; import com.netifera.platform.net.http.service.html.WebLink; import com.netifera.platform.net.http.service.html.WebPage; import com.netifera.platform.util.addresses.inet.InternetAddress; import com.netifera.platform.util.locators.TCPSocketLocator; import com.netifera.platform.util.patternmatching.InternetAddressMatcher; public class WebSpider { final private HTTP http; private boolean followLinks = true; private boolean fetchImages = false; private int maximumConnections = 5; private URI base;// = URI.create("http:///"); private String hostname = null; private final Set<String> knownSites = new HashSet<String>(); private final Set<String> knownPaths = new HashSet<String>(); private final Queue<URI> urlsQueue = new LinkedList<URI>(); private long realm; private IToolContext toolContext; private volatile int successCount = 0; private volatile int errorsCount = 0; private volatile boolean interrupted = false; class WebSpiderWorker implements HttpRequestExecutionHandler { private URI url = null; public void initalizeContext(final HttpContext context, final Object attachment) { // if (base != null && base.getHost() != null) // context.setAttribute(ExecutionContext.HTTP_TARGET_HOST, new HttpHost(base.getHost())); // else // context.setAttribute(ExecutionContext.HTTP_TARGET_HOST, new HttpHost("")); } public void finalizeContext(final HttpContext context) { if (context.getAttribute("url")!=null) { errorsCount += 1; retryURL((URI)context.getAttribute("url")); } } public HttpRequest submitRequest(final HttpContext context) { if (interrupted || isExceededErrorThreshold()) { return null; } url = nextURLOrNull(); if (url == null) { toolContext.debug("No more requests to submit"); return null; // no new request to submit } String page = url.getRawPath(); if (page.length() == 0) page = "/"; if (url.getRawQuery() != null) page += "?"+url.getRawQuery(); HttpRequest request = new BasicHttpRequest("GET", page); request.addHeader("Host", http.getURIHostPort(hostname)); context.setAttribute("request", request); context.setAttribute("url", url); return request; } public void handleResponse(final HttpResponse response, final HttpContext context) { successCount += 1; HttpEntity entity = response.getEntity(); try { HttpRequest request = (HttpRequest) context.getAttribute("request"); int status = response.getStatusLine().getStatusCode(); URI url = (URI)context.getAttribute("url"); context.setAttribute("url", null); byte[] contentBytes = null; String content = null; if (status < 200 || status >= 400) { toolContext.error(request.getRequestLine()+" -> "+response.getStatusLine().toString()); if (status == 401) { Header header = response.getFirstHeader("WWW-Authenticate"); if (header != null) { String method = header.getValue().split(" ")[0]; if (method.toLowerCase().equals("basic")) { String authRealm = header.getValue().split("\"")[1]; toolContext.info("Basic authentication realm \""+authRealm+"\" at "+url); Activator.getInstance().getWebEntityFactory().createBasicAuthentication(realm, toolContext.getSpaceId(), http.getLocator(), url, authRealm); } } } } else { toolContext.info(request.getRequestLine()+" -> "+response.getStatusLine().toString()); } if (entity.getContentType() != null) { String contentType = entity.getContentType().getValue(); // is favicon? get it and add it to the model if (status == 200 && url.getPath().equals("/favicon.ico") && contentType.matches("image/x-icon|application/octet-stream|text/plain")) { entity.consumeContent(); int length = (int) entity.getContentLength(); if (length > 0) { contentBytes = new byte[length]; if (entity.getContent().read(contentBytes) == length) { Activator.getInstance().getWebEntityFactory().setFavicon(realm, toolContext.getSpaceId(), http.getLocator(), url, contentBytes); } } } else { if (status == 200) Activator.getInstance().getWebEntityFactory().createWebPage(realm, toolContext.getSpaceId(), http.getLocator(), url, contentType); if (contentType.matches("(text/|application/x-javascript).*")) { content = EntityUtils.toString(entity); WebPage page = new WebPage(url, content); if (followLinks) { for (WebLink link : page.links()) { if (interrupted) return; follow(link.url()); } } for (String email: page.emails()) { if (interrupted) return; EmailAddressEntity e = Activator.getInstance().getDomainEntityFactory().createEmailAddress(realm, 0, email); e.addTag(base.toString()); e.save(); e.addToSpace(toolContext.getSpaceId()); } } } } // redirect if (status >= 300) { Header locationHeader = response.getFirstHeader("Location"); if (locationHeader != null) { URI location = URI.create(locationHeader.getValue()); toolContext.warning("Redirect "+url+" to "+location); int port = location.getPort() == -1 ? 80 : location.getPort(); String hostname = location.getHost(); List<InternetAddress> addresses; if (InternetAddressMatcher.matches(hostname)) { addresses = new ArrayList<InternetAddress>(1); addresses.add(InternetAddress.fromString(hostname)); } else { addresses = Activator.getInstance().getNameResolver().getAddressesByName(hostname); } for (InternetAddress address : addresses) { Activator.getInstance().getWebEntityFactory().createWebSite(realm, toolContext.getSpaceId(), new TCPSocketLocator(address, port), hostname); } if (followLinks) follow(url.resolve(location)); } } // now attempt web service detection if (contentBytes == null) { if (content != null) { contentBytes = content.getBytes(); } else { if (entity.getContentLength() > 0 && entity.getContentLength() < 1024) { entity.consumeContent(); contentBytes = new byte[(int)entity.getContentLength()]; entity.getContent().read(contentBytes); // XXX if retval < contentLength content appears broken } else { contentBytes = new byte[0]; } } } detectWebService(url, request, response, contentBytes); } catch (IOException ex) { toolContext.exception("I/O error when handling response: " + ex.getMessage(), ex); } catch (Exception ex) { toolContext.exception("Error when handling response: " + ex.getMessage(), ex); } } } public void setRealm(long realm) { this.realm = realm; } public void setContext(IToolContext context) { this.toolContext = context; } public void setBaseURL(URI base) { this.base = base; if (hostname == null && base.getHost() != null && base.getHost().length()>0) hostname = base.getHost(); } public URI getBaseURL() { return base; } public void setHostName(String hostname) { this.hostname = hostname; this.base = URI.create(http.getURI(hostname)); } public void setFollowLinks(boolean followLinks) { this.followLinks = followLinks; } public void setFetchImages(boolean fetchImages) { this.fetchImages = fetchImages; } public void setMaximumConnections(int maximumConnections) { this.maximumConnections = maximumConnections; } public WebSpider(HTTP service) { this.http = service; } private void detectWebService(URI url, HttpRequest request, HttpResponse response, byte[] content) { // FIXME there must be a more efficient way to do this StringBuffer requestBuffer = new StringBuffer(); StringBuffer responseBuffer = new StringBuffer(); requestBuffer.append(request.getRequestLine()); requestBuffer.append('\n'); for (Header header: request.getAllHeaders()) { requestBuffer.append(header); requestBuffer.append('\n'); } responseBuffer.append(response.getStatusLine()); responseBuffer.append('\n'); for (Header header: response.getAllHeaders()) { responseBuffer.append(header); responseBuffer.append('\n'); } responseBuffer.append('\n'); responseBuffer.append(new String(content)); Map<String,String> serviceInfo = Activator.getInstance().getWebApplicationDetector().detect(requestBuffer.toString(), responseBuffer.toString()); if (serviceInfo != null) { Activator.getInstance().getWebEntityFactory().createWebApplication(realm, toolContext.getSpaceId(), http.getLocator(), url, serviceInfo); toolContext.info(serviceInfo.get("serviceType")+" detected at "+url); } } public void run() throws InterruptedException, IOException { interrupted = false; final AsynchronousHTTPClient client = http.createAsynchronousClient(new WebSpiderWorker()); try { while (!Thread.currentThread().isInterrupted()) { if (!hasNextURL()) { toolContext.debug("No next URL.. waiting.."); Thread.sleep(2000); } if (!hasNextURL()) Thread.sleep(5000); if (!hasNextURL()) Thread.sleep(5000); if (!hasNextURL()) { while (client.getConnectionsCount() >= maximumConnections && !isExceededErrorThreshold() && !Thread.currentThread().isInterrupted()) { toolContext.debug("Waiting, still "+client.getConnectionsCount()+" active connections"); Thread.sleep(1000); } break; } // XXX gracefully handle the case when cannot connect // for example blogspot, tripod or wikipedia from cn toolContext.debug(">>>> launch new client"); client.connect(new SessionRequestCallback() { public void cancelled(SessionRequest request) { errorsCount++; } public void completed(SessionRequest request) { successCount++; } public void failed(SessionRequest request) { if (request.getException() != null) { toolContext.error("Can not connect: " + request.getException().getMessage()); } errorsCount++; } public void timeout(SessionRequest request) { errorsCount++; } }); Thread.sleep(500); while (client.getConnectionsCount() >= maximumConnections && !isExceededErrorThreshold() && !Thread.currentThread().isInterrupted()) { toolContext.debug("Waiting, currently already "+client.getConnectionsCount()+" connections"); Thread.sleep(1000); } if (isExceededErrorThreshold()) { toolContext.error("Exceeded maximum number of errors: "+errorsCount+"/"+successCount); break; } } } finally { if (Thread.currentThread().isInterrupted()) interrupted = true; client.shutdown(); } } private boolean isExceededErrorThreshold() { return errorsCount > ((successCount + 1) * 5); } private synchronized boolean hasNextURL() { return !urlsQueue.isEmpty(); } private synchronized URI nextURLOrNull() { return urlsQueue.poll(); } private synchronized void follow(URI url) { url = url.normalize(); String path = url.getPath(); if (path == null) { toolContext.debug("Bad URL, null path: "+url); return; // bad url, like javascript:void } String host = url.getHost(); if (host == null) { toolContext.debug("Bad URL, null host: "+url); return; // bad url, like javascript:void } if (!fetchImages && path.matches(".*(jpg|gif|png)$")) return; // TODO improve "outside site" concept int basePort = base.getPort() == -1 ? 80 : base.getPort(); int urlPort = url.getPort() == -1 ? 80 : url.getPort(); // follow redirects only to subdomains if (!host.equals(base.getHost()) || basePort != urlPort) { toolContext.debug("Ignoring "+url+" (outside site)"); String site = url.resolve("/").toString(); knownSites.add(site); return; } addURL(url); } public synchronized void addURL(URI url) { String path = url.getPath(); if (path.length() == 0) path = "/"; if (knownPaths.contains(path)) return; //FIXME what if we want to send again the same request with different query parameters? knownPaths.add(path); urlsQueue.add(url); } public synchronized void retryURL(URI url) { toolContext.warning("Retrying "+url); urlsQueue.add(url); } }