package uk.bl.crawling;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import controllers.WaybackController;
import models.Document;
import models.WatchedTarget;
import play.Logger;
import play.libs.F.Function;
import play.libs.F.Promise;
import play.libs.XPath;
import play.libs.ws.WS;
import play.libs.ws.WSRequestHolder;
import play.libs.ws.WSResponse;
import uk.bl.documents.MetadataExtractor;

/**
 * Crawls a watched target (live, or via Wayback replay) and collects the PDF
 * documents it links to, together with basic metadata.
 */
public class Crawler {

    private Set<String> knownSites;
    private List<Document> foundDocuments;
    private Integer maxDocuments;
    private String crawlTime;
    private boolean crawlWayback = false;
    private static String waybackUrl = WaybackController.getWaybackEndpoint();
    private Map<String, MetadataExtractor> metadataExtractors;

    public Crawler(boolean crawlWayback) {
        this.crawlWayback = crawlWayback;
        // Site-specific CSS selectors for title, publication date and author.
        metadataExtractors = new HashMap<>();
        metadataExtractors.put("www.ifs.org.uk", new MetadataExtractor(
                "*[itemtype=http://schema.org/CreativeWork] *[itemprop=name]",
                "*[itemtype=http://schema.org/CreativeWork] *[itemprop=datePublished]",
                "*[itemtype=http://schema.org/CreativeWork] *[itemprop=author]"));
        metadataExtractors.put("www.gov.uk", new MetadataExtractor("h1", null, null));
    }

    private static class CaptureRequestFunction implements Function<WSResponse, List<String>> {

        private String waybackTimestamp;

        public CaptureRequestFunction(String waybackTimestamp) {
            this.waybackTimestamp = waybackTimestamp;
        }

        @Override
        public List<String> apply(WSResponse response) {
            List<String> timestamps = new ArrayList<>();
            try {
                org.w3c.dom.Document xml = response.asXml();
                if (xml != null) {
                    NodeList nodes = XPath.selectNodes("/wayback/results/result/capturedate", xml);
                    for (int i = 0; i < nodes.getLength(); i++) {
                        Node node = nodes.item(i);
                        // Keep only captures newer than the last known timestamp.
                        if (waybackTimestamp == null || node.getTextContent().compareTo(waybackTimestamp) > 0)
                            timestamps.add(node.getTextContent());
                    }
                }
            } catch (Exception e) {
                Logger.error("Can't get timestamps via the Wayback API: " + e.getMessage());
            }
            return timestamps;
        }
    }

    /**
     * Queries the Wayback API for capture dates of the target's first field URL
     * that are newer than the target's current Wayback timestamp.
     */
    public static List<String> getNewerCrawlTimes(WatchedTarget watchedTarget) {
        WSRequestHolder holder = WS.url(waybackUrl + "xmlquery")
                .setQueryParameter("type", "urlquery")
                .setQueryParameter("url", watchedTarget.target.fieldUrls.get(0).url);
        if (watchedTarget.waybackTimestamp != null)
            holder.setQueryParameter("startdate", watchedTarget.waybackTimestamp);
        Promise<List<String>> timestampPromise = holder.get().map(
                new CaptureRequestFunction(watchedTarget.waybackTimestamp));
        return timestampPromise.get(5000);
    }
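    /*
     * For reference, getNewerCrawlTimes() expects the xmlquery endpoint to return a
     * response of roughly the following shape (a sketch inferred from the XPath
     * expression above; other elements may be present depending on the Wayback
     * version in use):
     *
     *   <wayback>
     *     <results>
     *       <result>
     *         <capturedate>20140321120000</capturedate>
     *       </result>
     *       ...
     *     </results>
     *   </wayback>
     *
     * Capture dates are fixed-width 14-digit timestamps (yyyyMMddHHmmss), which is
     * why the plain string compareTo() in CaptureRequestFunction orders them correctly.
     */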
    /**
     * Crawls the target's first field URL (optionally through Wayback replay at
     * crawlTime) up to the given link depth and returns the documents found,
     * stopping early once maxDocuments have been collected (if set).
     */
    public List<Document> crawlForDocuments(WatchedTarget watchedTarget, String crawlTime,
            int depth, Integer maxDocuments) {
        Logger.debug("crawlForDocuments");
        knownSites = new HashSet<>();
        foundDocuments = new ArrayList<>();
        this.crawlTime = crawlTime;
        this.maxDocuments = maxDocuments;
        String seedUrl = crawlWayback
                ? waybackReplayUrl(watchedTarget.target.fieldUrls.get(0).url, crawlTime)
                : watchedTarget.target.fieldUrls.get(0).url;
        knownSites.add(seedUrl);
        Set<Link> fringe = new HashSet<>();
        fringe.add(new Link(null, seedUrl));
        breadthFirstSearch(watchedTarget, fringe, depth);
        return foundDocuments;
    }

    private void breadthFirstSearch(WatchedTarget watchedTarget, Set<Link> fringe, int linkDepth) {
        Logger.debug("breadthFirstSearch");
        // Depth -1 is one extra level reserved for document-scheme URLs only; it is
        // skipped as soon as at least one document has been found.
        if (linkDepth < -1 || (linkDepth == -1 && !foundDocuments.isEmpty()))
            return;
        Set<Link> children = new HashSet<>();
        for (Link link : fringe) {
            try {
                if (linkDepth >= 0 || urlMatchesScheme(link.target, watchedTarget.documentUrlScheme)) {
                    Response response = getResponse(link.target);
                    String pageUrl = crawlWayback ? urlFromWayback(link.target) : link.target;
                    if (response.contentType() != null && response.contentType().contains("html")) {
                        if (linkDepth >= 0) {
                            org.jsoup.nodes.Document doc = response.parse();
                            for (Element element : doc.select("a[href]")) {
                                String waybackHrefUrl = element.absUrl("href").replace(" ", "%20");
                                String hrefUrl = crawlWayback ? urlFromWayback(waybackHrefUrl) : waybackHrefUrl;
                                if (hrefUrl != null && !knownSites.contains(hrefUrl)) {
                                    if (hrefUrl.endsWith(".pdf")) {
                                        if (urlMatchesScheme(hrefUrl, watchedTarget.documentUrlScheme)) {
                                            knownSites.add(hrefUrl);
                                            Logger.debug("pdf found: " + hrefUrl + " (via " + link.target + ")");
                                            Document document = new Document();
                                            document.landingPageUrl = pageUrl;
                                            document.documentUrl = hrefUrl;
                                            document.waybackTimestamp = crawlTime;
                                            document.setStatus(Document.Status.NEW);
                                            document.filename = URLDecoder.decode(
                                                    hrefUrl.substring(hrefUrl.lastIndexOf('/') + 1), "UTF-8");
                                            // Use the filename without its extension as a provisional title.
                                            int dotIndex = document.filename.indexOf('.');
                                            document.title = dotIndex > 0
                                                    ? document.filename.substring(0, dotIndex)
                                                    : document.filename;
                                            document.watchedTarget = watchedTarget;
                                            document.fastSubjects = watchedTarget.fastSubjects;
                                            extractMetadata(document);
                                            foundDocuments.add(document);
                                            if (maxDocuments != null && foundDocuments.size() >= maxDocuments)
                                                return;
                                        }
                                    } else if (domainIsEqual(pageUrl, hrefUrl)) {
                                        // Follow same-domain links only.
                                        knownSites.add(hrefUrl);
                                        children.add(new Link(link.target, waybackHrefUrl));
                                    }
                                }
                            }
                        }
                    } else if (urlMatchesScheme(pageUrl, watchedTarget.documentUrlScheme)) {
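                        // Non-HTML response whose URL matches the document scheme: this may be
                        // a PDF served without a ".pdf" suffix ("hidden pdf"). Detect it via the
                        // Content-Type header or a Content-Disposition header such as
                        //   Content-Disposition: attachment; filename="report.pdf"
                        // (header value shown for illustration only).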
                        String contentDisposition = response.header("Content-Disposition");
                        contentDisposition = contentDisposition == null ? "" : contentDisposition.replace("\"", "");
                        String contentType = response.header("Content-Type");
                        if ("application/pdf".equals(contentType) || contentDisposition.endsWith(".pdf")) {
                            Document document = new Document();
                            document.landingPageUrl = crawlWayback ? urlFromWayback(link.source) : link.source;
                            document.documentUrl = pageUrl;
                            document.waybackTimestamp = crawlTime;
                            document.setStatus(Document.Status.NEW);
                            if ("application/pdf".equals(contentType))
                                document.filename = URLDecoder.decode(
                                        pageUrl.substring(pageUrl.lastIndexOf('/') + 1), "UTF-8");
                            else
                                document.filename = contentDisposition.substring(contentDisposition.lastIndexOf('=') + 1);
                            int dotIndex = document.filename.indexOf('.');
                            document.title = dotIndex > 0
                                    ? document.filename.substring(0, dotIndex)
                                    : document.filename;
                            document.watchedTarget = watchedTarget;
                            document.fastSubjects = watchedTarget.fastSubjects;
                            Logger.debug("hidden pdf found: " + document.filename + " (url: " + pageUrl + ")");
                            foundDocuments.add(document);
                            if (maxDocuments != null && foundDocuments.size() >= maxDocuments)
                                return;
                        }
                    }
                }
            } catch (IOException e) {
                Logger.info("Can't get content of url: " + link.target);
                e.printStackTrace();
            }
        }
        breadthFirstSearch(watchedTarget, children, linkDepth - 1);
    }

    /**
     * Re-fetches the document's landing page through Wayback replay and applies a
     * site-specific MetadataExtractor, if one is registered for the page's domain.
     */
    public void extractMetadata(Document document) {
        try {
            String domain = new URI(document.landingPageUrl).getHost();
            if (metadataExtractors.containsKey(domain)) {
                MetadataExtractor metadataExtractor = metadataExtractors.get(domain);
                String wbu = waybackReplayUrl(document.landingPageUrl, document.waybackTimestamp);
                org.jsoup.nodes.Document doc = Jsoup.connect(wbu).get();
                metadataExtractor.extract(document, doc);
            }
        } catch (IOException | URISyntaxException e) {
            e.printStackTrace();
        }
    }

    private Response getResponse(String url) throws IOException {
        Logger.debug("getResponse: " + url);
        Connection connection = Jsoup.connect(url);
        connection.request().method(Method.GET);
        // Accept non-HTML content types so PDFs can be examined too.
        connection.ignoreContentType(true);
        return connection.execute();
    }

    private String waybackReplayUrl(String url, String timestamp) {
        return waybackUrl + "replay?url=" + url + "&date=" + timestamp;
    }

    /**
     * Recovers the original URL from a Wayback URL, either from a "url=" query
     * parameter or from a URL embedded in the path; returns null if neither matches.
     */
    private String urlFromWayback(String waybackUrl) {
        Pattern urlPattern1 = Pattern.compile("^[^?]+\\?.*url=([^&]+).*$");
        Pattern urlPattern2 = Pattern.compile("^[^/]+//.*/([^/]+//.*)$");
        Matcher matcher1 = urlPattern1.matcher(waybackUrl);
        if (matcher1.matches())
            return matcher1.group(1);
        Matcher matcher2 = urlPattern2.matcher(waybackUrl);
        if (matcher2.matches())
            return matcher2.group(1);
        return null;
    }

    private boolean domainIsEqual(String url, String targetUrl) {
        // Index 2 of "protocol://host/..." split on "/" is the host part.
        if (url == null || url.split("/").length <= 2 || targetUrl.split("/").length <= 2)
            return false;
        return url.split("/")[2].equals(targetUrl.split("/")[2]);
    }

    private boolean urlMatchesScheme(String url, String scheme) {
        String urlWithoutProtocol = url.substring(url.indexOf("//") + 2);
        return urlWithoutProtocol.startsWith(scheme);
    }
}
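/*
 * Example usage (a minimal sketch, not taken from this codebase; assumes a populated
 * WatchedTarget with at least one field URL, and illustrative values for the depth
 * and maxDocuments arguments):
 *
 *   Crawler crawler = new Crawler(true); // true = crawl via Wayback replay
 *   for (String timestamp : Crawler.getNewerCrawlTimes(watchedTarget)) {
 *       List<Document> docs = crawler.crawlForDocuments(watchedTarget, timestamp, 2, 10);
 *       // ... persist or review the documents found in this capture
 *   }
 */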