package at.chille.crawler;

import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.http.Header;
import org.apache.http.HttpStatus;
import org.apache.tika.metadata.Metadata;

import at.chille.crawler.database.model.HostInfo;
import at.chille.crawler.database.model.PageInfo;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.fetcher.CustomFetchStatus;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * Crawler that records HTTP response headers and a few HTML fingerprints
 * (embedded third-party scripts, the "generator" meta tag) per host.
 *
 * <p>
 * URLs matching an entry of {@code url-whitelist.txt} are always followed.
 * All other URLs are only followed for "inspection": at most
 * {@link #inspectionLimitPerHost} URLs per host and protocol, and only for
 * domains ending in {@code .at}.
 *
 * <p>
 * All reads and writes of a {@link HostInfo} are performed while holding the
 * per-host lock handed out by {@code DatabaseManager.getHostLock(..)}.
 * NOTE(review): this relies on {@code getHostLock} returning the same lock
 * object for the same host name on every call - confirm in DatabaseManager.
 *
 * @author chille
 */
public class HttpAnalysisCrawler extends WebCrawler {

    /** URLs with these file extensions (static resources, media, archives) are never visited. */
    private static final Pattern FILTERS = Pattern
            .compile(".*(\\.(css|js|bmp|gif|ico|jpe?g"
                    + "|png|tiff?|mid|mp2|mp3|mp4"
                    + "|wav|avi|mov|mpeg|ram|m4v|pdf"
                    + "|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    /** Maximum number of URLs requested from the frontier per round. */
    private static final int MAX_FETCH = 20;

    /** Sleep time in milliseconds when the frontier currently has no URLs. */
    private static final long IDLE_SLEEP_MILLIS = 3000;

    /** Short pause in milliseconds before trying a different URL instead of waiting. */
    private static final long DEFER_SLEEP_MILLIS = 50;

    /** Politeness waits longer than this (milliseconds) are bridged by trying another URL first. */
    private static final long DEFER_THRESHOLD_MILLIS = 200;

    /** Only domains with this suffix are followed for inspection. */
    private static final String INSPECTED_DOMAIN_SUFFIX = ".at";

    /** Lower-cased URL prefixes that are always visited. */
    protected HashSet<String> WHITELIST = new HashSet<String>();

    /** Maximum number of URLs per host and protocol that are followed for inspection. */
    protected int inspectionLimitPerHost = 1;

    /** Minimum time in milliseconds between two requests to the same host. */
    protected long niceWaitTime = 500;

    // lower priorities will be fetched earlier!
    protected byte PRIORITY_WHITELIST = 10;
    protected byte PRIORITY_INSPECT_HTTP = 5;
    protected byte PRIORITY_INSPECT_HTTPS = 5;

    /** Currently not consulted by this class; kept for subclasses. */
    protected HashSet<String> interestingHeaders = new HashSet<String>();

    /** Lower-cased names of response headers that are not stored. */
    protected HashSet<String> blacklistHeaders = new HashSet<String>();

    /** Lower-cased HTML fragment mapped to the label stored when the fragment is found in a page. */
    protected Map<String, String> detectHTML = new HashMap<String, String>();

    /**
     * Loads the URL whitelist and the header blacklist from
     * {@code url-whitelist.txt} and {@code headers-blacklist.txt} and registers
     * the built-in HTML fingerprints.
     *
     * <p>
     * Blank lines are skipped: an empty whitelist entry would be a prefix of
     * every URL and would therefore whitelist everything.
     */
    public HttpAnalysisCrawler() {
        for (String whitelist : StringFileReader.readLines("url-whitelist.txt")) {
            addNormalized(WHITELIST, whitelist);
        }
        for (String blacklist : StringFileReader
                .readLines("headers-blacklist.txt")) {
            addNormalized(blacklistHeaders, blacklist);
        }

        detectHTML.put(lower("googletagservices.com/tag/js/gpt.js"),
                "Google Publisher Tag");
        detectHTML.put(lower("google.com/recaptcha/api/js/recaptcha_ajax.js"),
                "Google Recaptcha");
        detectHTML.put(lower("connect.facebook.net/de_DE/all.js"),
                "Facebook Likes");
        detectHTML.put(lower("facebook.com/plugins/like.php"),
                "Facebook Likes");
        detectHTML.put(lower("apis.google.com/js/plusone.js"), "Google Plus");
        detectHTML.put(lower("googlesyndication.com/pagead/show_ads.js"),
                "Google Syndication");
        detectHTML.put(lower(".google-analytics.com/ga.js"),
                "Google Analytics");
        detectHTML.put(lower("//ajax.googleapis.com"), "Google Ajax API");
        // not so good:
        detectHTML.put(lower("<script type=\"text/javascript\">"), "JSinHTML");
        detectHTML.put(lower("</style>"), "CSSinHTML");
    }

    /**
     * Counts the URLs in {@code visited} that use the same protocol as
     * {@code current}. Only {@code http://} and {@code https://} are known.
     *
     * @param visited
     *            URLs to be counted
     * @param current
     *            URL whose protocol is used as the filter
     * @return number of URLs in {@code visited} with the protocol of
     *         {@code current}; {@code 0} if {@code current} uses neither
     *         {@code http://} nor {@code https://}, or if an argument is
     *         {@code null}
     */
    public int countSimilarURLs(Set<String> visited, String current) {
        if (visited == null || current == null) {
            return 0;
        }
        final String protocol;
        if (current.startsWith("https://")) {
            protocol = "https://";
        } else if (current.startsWith("http://")) {
            protocol = "http://";
        } else {
            // Unknown protocol: nothing can be "similar". Previously this ended
            // in startsWith(null) and a NullPointerException.
            return 0;
        }

        int count = 0;
        for (String url : visited) {
            // TODO: filter by folders?
            if (url != null && url.startsWith(protocol)) {
                count++;
            }
        }
        return count;
    }

    /**
     * Decides if the given WebURL should be visited for inspection. Returns true
     * if the URL belongs to an inspected domain, was neither visited nor queued
     * yet, and the number of visited or queued URLs of the same host with the
     * same protocol is less than {@link #inspectionLimitPerHost}. On a positive
     * decision the URL is added to the host's todo list.
     *
     * @param webUrl
     *            candidate URL
     * @return true if the URL should be visited
     */
    public boolean shouldVisitForInspection(WebURL webUrl) {
        String fullDomain = DatabaseManager.getFullDomain(webUrl);
        String href = lower(webUrl.getURL());

        String domain = webUrl.getDomain();
        // ".at" rather than "at": a bare "at" suffix also matches .cat, .chat, ...
        if (domain == null || !domain.endsWith(INSPECTED_DOMAIN_SUFFIX)) {
            return false;
        }

        lockHost(fullDomain);
        try {
            DatabaseManager db = DatabaseManager.getInstance();
            HostInfo hostInfo = db.getHostInfo(fullDomain);
            if (hostInfo == null) {
                // First URL ever seen for this host: always inspect it.
                hostInfo = new HostInfo();
                hostInfo.setHostName(fullDomain);
                hostInfo.getTodoUrls().add(href);
                db.addHostInfo(hostInfo);
                return true;
            }
            if (hostInfo.getPages().containsKey(href)
                    || hostInfo.getTodoUrls().contains(href)) {
                return false;
            }

            int similarURLs = countSimilarURLs(hostInfo.getPages().keySet(), href)
                    + countSimilarURLs(hostInfo.getTodoUrls(), href);
            if (similarURLs >= inspectionLimitPerHost) {
                return false;
            }
            hostInfo.getTodoUrls().add(href);
            db.saveHostInfo(hostInfo);
            return true;
        } finally {
            // Released on every path, including exceptions, so that a failure
            // cannot block all further work on this host.
            unlockHost(fullDomain);
        }
    }

    /**
     * Visits a URL if it is not a filtered static resource and either starts
     * with a whitelisted prefix (in which case its priority is set to
     * {@link #PRIORITY_WHITELIST}) or qualifies for inspection.
     */
    @Override
    public boolean shouldVisit(WebURL url) {
        String href = lower(url.getURL());
        if (FILTERS.matcher(href).matches()) {
            // abort on images, ...
            return false;
        }

        for (String white : WHITELIST) {
            if (href.startsWith(white)) {
                url.setPriority(this.PRIORITY_WHITELIST);
                return true;
            }
        }
        return shouldVisitForInspection(url);
    }

    /**
     * Returns the time to wait before fetching the given URL. Returns zero if we
     * don't have to sleep; in that case the host's last-visited time is set to
     * now.
     *
     * @param url
     *            URL about to be fetched
     * @return milliseconds to wait, or {@code 0}
     */
    public long getNiceWaitTime(WebURL url) {
        String fullDomain = DatabaseManager.getFullDomain(url);

        // Take the lock BEFORE reading the host so that the check and the
        // update of lastVisited happen under the same lock.
        lockHost(fullDomain);
        try {
            DatabaseManager db = DatabaseManager.getInstance();
            HostInfo host = db.getHostInfo(fullDomain);
            long now = new Date().getTime();

            if (host == null) {
                host = new HostInfo();
                host.setHostName(fullDomain);
                host.setLastVisited(now);
                db.addHostInfo(host);
                db.saveHostInfo(host);
                return 0;
            }

            long timeToWait = host.getLastVisited() + niceWaitTime - now;
            if (timeToWait <= 0) {
                // here it is not important to store to the database.
                host.setLastVisited(now);
                return 0;
            }
            return timeToWait;
        } finally {
            unlockHost(fullDomain);
        }
    }

    /**
     * Main loop of the crawler thread: fetches batches of URLs from the frontier
     * and processes them while honouring the per-host wait time.
     *
     * <p>
     * A URL that has to wait is put back into the current batch (at the end if
     * another URL can be tried first, at the front otherwise) and re-checked
     * later, so it is never dropped. If the thread is interrupted while
     * sleeping, the interrupt flag is restored and the loop ends.
     */
    @Override
    public void run() {
        onStart();
        while (true) {
            List<WebURL> assignedURLs = new ArrayList<WebURL>(MAX_FETCH);
            isWaitingForNewURLs = true;
            frontier.getNextURLs(MAX_FETCH, assignedURLs);
            isWaitingForNewURLs = false;

            if (assignedURLs.isEmpty()) {
                if (frontier.isFinished()) {
                    return;
                }
                // wait for new urls (currently no urls available)
                if (!sleepQuietly(IDLE_SLEEP_MILLIS)) {
                    return;
                }
                continue;
            }

            while (!assignedURLs.isEmpty()) {
                WebURL curURL = assignedURLs.remove(0);
                if (curURL != null) {
                    long wait = getNiceWaitTime(curURL);
                    if (wait <= 0) {
                        processPage(curURL);
                        frontier.setProcessed(curURL);
                    } else if (!assignedURLs.isEmpty()
                            && wait > DEFER_THRESHOLD_MILLIS) {
                        // try another url before, add to the end of the queue..
                        if (!sleepQuietly(DEFER_SLEEP_MILLIS)) {
                            return;
                        }
                        assignedURLs.add(curURL);
                    } else {
                        // Really wait, then re-check this URL first. Putting it
                        // back is essential: it was already removed from the
                        // batch and would otherwise be lost unprocessed.
                        if (!sleepQuietly(wait)) {
                            return;
                        }
                        assignedURLs.add(0, curURL);
                    }
                }
                if (myController.isShuttingDown()) {
                    logger.info("Exiting because of controller shutdown.");
                    return;
                }
            }
        }
    }

    /**
     * Fetches, parses and analyses one URL: handles redirects, schedules
     * outgoing links (each http link additionally as https) and finally calls
     * {@link #visit(Page)}.
     *
     * @param curURL
     *            URL to process; {@code null} is ignored
     */
    @Override
    protected void processPage(WebURL curURL) {
        if (curURL == null) {
            return;
        }
        logger.info("trying to fetch: " + lower(curURL.getURL()));
        PageFetchResult fetchResult = null;
        try {
            fetchResult = pageFetcher.fetchHeader(curURL);
            int statusCode = fetchResult.getStatusCode();
            handlePageStatusCode(curURL, statusCode,
                    CustomFetchStatus.getStatusDescription(statusCode));

            if (statusCode != HttpStatus.SC_OK) {
                if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY
                        || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
                    if (myController.getConfig().isFollowRedirects()) {
                        String movedToUrl = fetchResult.getMovedToUrl();
                        if (movedToUrl == null) {
                            return;
                        }
                        if (docIdServer.getDocId(movedToUrl) > 0) {
                            // Redirect page is already seen
                            return;
                        }

                        // The redirect target inherits the crawl position of
                        // the redirecting URL.
                        WebURL webURL = new WebURL();
                        webURL.setURL(movedToUrl);
                        webURL.setParentDocid(curURL.getParentDocid());
                        webURL.setParentUrl(curURL.getParentUrl());
                        webURL.setDepth(curURL.getDepth());
                        webURL.setDocid(-1);
                        webURL.setAnchor(curURL.getAnchor());
                        if (movedToUrl.startsWith("https://")) {
                            webURL.setPriority(this.PRIORITY_INSPECT_HTTPS);
                        } else {
                            webURL.setPriority(this.PRIORITY_INSPECT_HTTP);
                        }

                        if (shouldVisit(webURL)
                                && robotstxtServer.allows(webURL)) {
                            webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                            frontier.schedule(webURL);
                        }
                        logger.info(" Moved: " + webURL.getURL());

                        // notify Database of redirect!
                        recordRedirect(curURL, webURL);
                    }
                } else if (statusCode == CustomFetchStatus.PageTooBig) {
                    logger.info("Skipping a page which was bigger than max allowed size: "
                            + curURL.getURL());
                }
                return;
            }

            String fetchedUrl = fetchResult.getFetchedUrl();
            if (!curURL.getURL().equals(fetchedUrl)) {
                if (docIdServer.isSeenBefore(fetchedUrl)) {
                    // Redirect page is already seen
                    return;
                }
                curURL.setURL(fetchedUrl);
                curURL.setDocid(docIdServer.getNewDocID(fetchedUrl));
            }

            Page page = new Page(curURL);
            int docid = curURL.getDocid();
            if (!fetchResult.fetchContent(page)) {
                onContentFetchError(curURL);
                return;
            }
            if (!parser.parse(page, curURL.getURL())) {
                onParseError(curURL);
                return;
            }

            ParseData parseData = page.getParseData();
            if (parseData instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) parseData;
                List<WebURL> toSchedule = new ArrayList<WebURL>();
                int maxCrawlDepth = myController.getConfig()
                        .getMaxDepthOfCrawling();
                short childDepth = (short) (curURL.getDepth() + 1);

                for (WebURL webURL : htmlParseData.getOutgoingUrls()) {
                    // TODO: add urls with ascending path (remove folder)
                    // www.google.at/a/b/c/d/
                    // www.google.at/a/b/c/
                    // www.google.at/a/b/
                    // www.google.at/a/
                    // www.google.at/

                    // always try https at least once
                    WebURL httpsURL = getHttpsURL(webURL);
                    if (httpsURL != null) {
                        httpsURL.setParentDocid(docid);
                        httpsURL.setParentUrl(curURL.getURL());
                        if (docIdServer.getDocId(httpsURL.getURL()) <= 0) {
                            httpsURL.setDepth(childDepth);
                            httpsURL.setDocid(-1);
                            httpsURL.setPriority(this.PRIORITY_INSPECT_HTTPS);
                            // NOTE(review): this compares the CHILD depth with
                            // the limit, while the plain link below compares
                            // the PARENT depth - https variants are therefore
                            // cut off one level earlier. Confirm intent.
                            if ((maxCrawlDepth == -1
                                    || httpsURL.getDepth() < maxCrawlDepth)
                                    && shouldVisit(httpsURL)
                                    && robotstxtServer.allows(httpsURL)) {
                                httpsURL.setDocid(docIdServer
                                        .getNewDocID(httpsURL.getURL()));
                                toSchedule.add(httpsURL);
                            }
                        }
                    }
                    // end of https schedule

                    webURL.setParentDocid(docid);
                    webURL.setParentUrl(curURL.getURL());
                    int newdocid = docIdServer.getDocId(webURL.getURL());
                    if (newdocid > 0) {
                        // This is not the first time that this Url is
                        // visited. So, we set the depth to a negative
                        // number.
                        webURL.setDepth((short) -1);
                        webURL.setDocid(newdocid);
                    } else {
                        webURL.setDocid(-1);
                        webURL.setDepth(childDepth);
                        webURL.setPriority(this.PRIORITY_INSPECT_HTTP);
                        if ((maxCrawlDepth == -1
                                || curURL.getDepth() < maxCrawlDepth)
                                && shouldVisit(webURL)
                                && robotstxtServer.allows(webURL)) {
                            webURL.setDocid(docIdServer.getNewDocID(webURL
                                    .getURL()));
                            toSchedule.add(webURL);
                        }
                    }
                }
                frontier.scheduleAll(toSchedule);
            }

            try {
                visit(page);
            } catch (Exception e) {
                logger.error("Exception while running the visit method. Message: '"
                        + e.getMessage() + "' at " + e.getStackTrace()[0]);
            }
        } catch (Exception e) {
            logger.error(e.getMessage() + ", while processing: "
                    + curURL.getURL());
        } finally {
            if (fetchResult != null) {
                fetchResult.discardContentIfNotConsumed();
            }
        }
    }

    /**
     * Returns the same URL as given, but using the HTTPS protocol. Returns null
     * if the URL is already using the HTTPS protocol or uses an unknown
     * protocol (the latter is logged).
     *
     * @param webURL
     *            http or https URL
     * @return https URL or null
     */
    private WebURL getHttpsURL(WebURL webURL) {
        String original = webURL.getURL();
        if (original.startsWith("https://")) {
            return null;
        }
        if (!original.startsWith("http://")) {
            logger.error("-- UNKNOWN PROTOCOL : " + original);
            return null;
        }
        WebURL httpsVariant = new WebURL();
        httpsVariant.setURL("https://" + original.substring("http://".length()));
        return httpsVariant;
    }

    /**
     * Stores the response headers and HTML fingerprints of a fetched page in
     * the {@link HostInfo} of its host. Does nothing if more than one page with
     * the same protocol was already stored for that host.
     *
     * @param page
     *            fetched and parsed page
     */
    @Override
    public void visit(Page page) {
        WebURL webUrl = page.getWebURL();
        String url = lower(webUrl.getURL());
        String fullDomain = DatabaseManager.getFullDomain(webUrl);

        PageInfo pageInfo = new PageInfo();
        pageInfo.setUrl(url);

        lockHost(fullDomain);
        try {
            DatabaseManager db = DatabaseManager.getInstance();
            HostInfo hostInfo = db.getHostInfo(fullDomain);
            if (hostInfo == null) {
                hostInfo = new HostInfo();
                hostInfo.setHostName(fullDomain);
                db.addHostInfo(hostInfo);
            }

            int visitedPages = countSimilarURLs(hostInfo.getPages().keySet(),
                    url);
            if (visitedPages > 1) {
                // abort, not interesting any more
                logger.debug("Already visited several pages of this webserver: '"
                        + url + "'. Aborting.");
                return;
            }
            hostInfo.addPage(pageInfo);
            logger.debug(" Domain: '" + fullDomain + "'\tURL: " + url);

            if (page.getParseData() instanceof HtmlParseData) {
                collectHtmlFindings((HtmlParseData) page.getParseData(),
                        pageInfo);
            }
            collectResponseHeaders(page.getFetchResponseHeaders(), pageInfo);

            db.saveHostInfo(hostInfo);
        } finally {
            unlockHost(fullDomain);
        }
    }

    /**
     * Records that {@code curURL} answered with a redirect to {@code target}:
     * stores a page entry for {@code curURL} and removes it from the todo list.
     *
     * <p>
     * NOTE(review): the entry is stored under the host of the redirect TARGET,
     * while the todo entry was added under the host of the redirecting URL. For
     * cross-host redirects the todo entry is therefore never removed. Behaviour
     * kept as found - confirm intent.
     */
    private void recordRedirect(WebURL curURL, WebURL target) {
        String fullDomain = DatabaseManager.getFullDomain(target);
        String sourceUrl = lower(curURL.getURL());

        lockHost(fullDomain);
        try {
            DatabaseManager db = DatabaseManager.getInstance();
            HostInfo hostInfo = db.getHostInfo(fullDomain);
            if (hostInfo == null) {
                hostInfo = new HostInfo();
                hostInfo.setHostName(fullDomain);
                db.addHostInfo(hostInfo);
            }

            PageInfo page = new PageInfo();
            page.setUrl(sourceUrl);
            page.setAccessTime(new Date().getTime());

            hostInfo.getTodoUrls().remove(sourceUrl);
            hostInfo.addPage(page);
            // Store to database
            db.saveHostInfo(hostInfo);
        } finally {
            unlockHost(fullDomain);
        }
    }

    /** Adds a "Skript" header per detected HTML fragment and a "META/Generator" header if present. */
    private void collectHtmlFindings(HtmlParseData htmlParseData,
            PageInfo pageInfo) {
        String html = lower(htmlParseData.getHtml());
        for (Map.Entry<String, String> pair : detectHTML.entrySet()) {
            if (html.contains(pair.getKey())) {
                System.out.println("Detected: " + pair.getValue());
                pageInfo.addHeader(newHeader("Skript", pair.getValue()));
            }
        }

        Metadata meta = htmlParseData.getMetadata();
        String generator = (meta == null) ? null : meta.get("generator");
        if (generator != null) {
            System.out.println("Generator: " + generator);
            pageInfo.addHeader(newHeader("META/Generator", generator));
        }
    }

    /** Copies every response header whose name is not blacklisted into {@code pageInfo}. */
    private void collectResponseHeaders(Header[] responseHeaders,
            PageInfo pageInfo) {
        if (responseHeaders == null) {
            return;
        }
        for (Header header : responseHeaders) {
            if (!blacklistHeaders.contains(lower(header.getName()))) {
                pageInfo.addHeader(newHeader(header.getName(),
                        header.getValue()));
            }
        }
    }

    /** Creates a database header entity with the given name and value. */
    private static at.chille.crawler.database.model.Header newHeader(
            String name, String value) {
        at.chille.crawler.database.model.Header headerDB = new at.chille.crawler.database.model.Header();
        headerDB.setName(name);
        headerDB.setValue(value);
        return headerDB;
    }

    /** Adds the trimmed, lower-cased line to {@code target}; null and blank lines are ignored. */
    private static void addNormalized(Set<String> target, String line) {
        if (line == null) {
            return;
        }
        String normalized = lower(line.trim());
        if (!normalized.isEmpty()) {
            target.add(normalized);
        }
    }

    /** Lower-cases independently of the default locale (e.g. no dotless i under a Turkish locale). */
    private static String lower(String value) {
        return value.toLowerCase(Locale.ROOT);
    }

    /** Acquires the per-host lock; must be paired with {@link #unlockHost(String)} in a finally block. */
    private static void lockHost(String fullDomain) {
        DatabaseManager.getInstance().getHostLock(fullDomain).lock();
    }

    /** Releases the per-host lock taken by {@link #lockHost(String)}. */
    private static void unlockHost(String fullDomain) {
        DatabaseManager.getInstance().getHostLock(fullDomain).unlock();
    }

    /**
     * Sleeps for the given time.
     *
     * @return {@code true} if the sleep completed; {@code false} if the thread
     *         was interrupted, in which case the interrupt flag is restored
     */
    private boolean sleepQuietly(long millis) {
        try {
            Thread.sleep(millis);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.info("Crawler thread interrupted while waiting; stopping.");
            return false;
        }
    }
}