package uk.bl.crawling;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import controllers.WaybackController;
import models.Document;
import models.WatchedTarget;
import play.Logger;
import play.Play;
import play.libs.F.Function;
import play.libs.F.Promise;
import play.libs.XPath;
import play.libs.ws.WS;
import play.libs.ws.WSRequestHolder;
import play.libs.ws.WSResponse;
import uk.bl.documents.MetadataExtractor;
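/**
 * Crawls a watched target, either on the live web or through the Wayback
 * Machine, collecting the PDF documents it links to. Site-specific metadata
 * extractors are applied for known hosts once a document has been found.
 */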
public class Crawler {
// URLs already visited or queued in the current crawl.
private Set<String> knownSites;
private List<Document> foundDocuments;
// Maximum number of documents to collect; null means no limit.
private Integer maxDocuments;
// Wayback timestamp of the crawl being examined.
private String crawlTime;
// If true, fetch pages from the Wayback Machine instead of the live web.
private boolean crawlWayback = false;
private static String waybackUrl = WaybackController.getWaybackEndpoint();
// Site-specific metadata extractors, keyed by host name.
private Map<String, MetadataExtractor> metadataExtractors;
public Crawler(boolean crawlWayback) {
this.crawlWayback = crawlWayback;
metadataExtractors = new HashMap<>();
// Site-specific CSS selectors for title, publication date, and author.
metadataExtractors.put("www.ifs.org.uk", new MetadataExtractor("*[itemtype=http://schema.org/CreativeWork] *[itemprop=name]",
"*[itemtype=http://schema.org/CreativeWork] *[itemprop=datePublished]",
"*[itemtype=http://schema.org/CreativeWork] *[itemprop=author]"));
metadataExtractors.put("www.gov.uk", new MetadataExtractor("h1", null, null));
}
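/**
 * Maps a Wayback capture-query response to the list of capture dates,
 * keeping only those newer than the given timestamp (or all of them if
 * the timestamp is null).
 */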
private static class CaptureRequestFunction implements Function<WSResponse, List<String>> {
private String waybackTimestamp;
public CaptureRequestFunction(String waybackTimestamp) {
this.waybackTimestamp = waybackTimestamp;
}
@Override
public List<String> apply(WSResponse response) {
List<String> timestamps = new ArrayList<>();
try {
org.w3c.dom.Document xml = response.asXml();
if (xml != null) {
NodeList nodes = XPath.selectNodes("/wayback/results/result/capturedate", xml);
for (int i=0; i < nodes.getLength(); i++) {
Node node = nodes.item(i);
// Capture dates are fixed-width digit strings, so lexicographic
// comparison matches chronological order.
if (waybackTimestamp == null || node.getTextContent().compareTo(waybackTimestamp) > 0)
timestamps.add(node.getTextContent());
}
}
} catch (Exception e) {
Logger.error("Can't get timestamps via the Wayback API: " + e.getMessage());
}
return timestamps;
}
}
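/**
 * Queries the Wayback xmlquery API for capture dates of the target's first
 * field URL that are newer than the watched target's current timestamp.
 * Blocks for up to five seconds waiting for the response.
 */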
public static List<String> getNewerCrawlTimes(WatchedTarget watchedTarget) {
WSRequestHolder holder = WS.url(waybackUrl + "xmlquery")
.setQueryParameter("type", "urlquery")
.setQueryParameter("url", watchedTarget.target.fieldUrls.get(0).url);
if (watchedTarget.waybackTimestamp != null)
holder.setQueryParameter("startdate", watchedTarget.waybackTimestamp);
Promise<List<String>> timestampPromise = holder.get().map(
new CaptureRequestFunction(watchedTarget.waybackTimestamp));
return timestampPromise.get(5000);
}
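/**
 * Crawls the watched target's first field URL (via the Wayback Machine if
 * crawlWayback is set) up to the given link depth and returns the PDF
 * documents found. A null maxDocuments means no limit.
 */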
public List<Document> crawlForDocuments(WatchedTarget watchedTarget, String crawlTime, int depth, Integer maxDocuments) {
Logger.debug("crawlForDocuments");
knownSites = new HashSet<>();
foundDocuments = new ArrayList<>();
this.crawlTime = crawlTime;
this.maxDocuments = maxDocuments;
String seedUrl = crawlWayback ?
waybackReplayUrl(watchedTarget.target.fieldUrls.get(0).url, crawlTime) :
watchedTarget.target.fieldUrls.get(0).url;
knownSites.add(seedUrl);
Set<Link> fringe = new HashSet<>();
fringe.add(new Link(null, seedUrl));
breadthFirstSearch(watchedTarget, fringe, depth);
return foundDocuments;
}
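/**
 * Expands the fringe one level per call. While linkDepth is non-negative,
 * HTML pages are parsed and their links followed; at linkDepth == -1 the
 * remaining fringe is only fetched to check scheme-matching URLs for
 * "hidden" PDFs served without a .pdf extension, and that final pass is
 * skipped if documents have already been found.
 */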
private void breadthFirstSearch(WatchedTarget watchedTarget, Set<Link> fringe, int linkDepth) {
Logger.debug("breadthFirstSearch");
// Depth -1 is the final hidden-PDF pass; skip it if documents were already found.
if (linkDepth < -1 || (linkDepth == -1 && !foundDocuments.isEmpty())) return;
Set<Link> children = new HashSet<>();
for (Link link : fringe) {
try {
if (linkDepth >= 0 || urlMatchesScheme(link.target, watchedTarget.documentUrlScheme)) {
Response response = getResponse(link.target);
String pageUrl = crawlWayback ?
urlFromWayback(link.target) : link.target;
String responseContentType = response.contentType();
if (responseContentType != null && responseContentType.contains("html")) {
if (linkDepth >= 0) {
org.jsoup.nodes.Document doc = response.parse();
for(Element element : doc.select("a[href]")) {
String waybackHrefUrl = element.absUrl("href").replace(" ", "%20");
String hrefUrl = crawlWayback ?
urlFromWayback(waybackHrefUrl) : waybackHrefUrl;
if (hrefUrl != null && !knownSites.contains(hrefUrl)) {
if (hrefUrl.endsWith(".pdf")) {
if (urlMatchesScheme(hrefUrl, watchedTarget.documentUrlScheme)) {
knownSites.add(hrefUrl);
Logger.debug("pdf found: " + hrefUrl + " (via " + link.target + ")");
Document document = new Document();
document.landingPageUrl = pageUrl;
document.documentUrl = hrefUrl;
document.waybackTimestamp = crawlTime;
document.setStatus(Document.Status.NEW);
document.filename = URLDecoder.decode(hrefUrl.substring(hrefUrl.lastIndexOf('/')+1), "UTF-8");
// The URL ends in ".pdf", so stripping from the last dot yields the base name.
document.title = document.filename.substring(0, document.filename.lastIndexOf('.'));
document.watchedTarget = watchedTarget;
document.fastSubjects = watchedTarget.fastSubjects;
extractMetadata(document);
foundDocuments.add(document);
if (maxDocuments != null && foundDocuments.size() >= maxDocuments) return;
}
} else if(domainIsEqual(pageUrl, hrefUrl)) {
knownSites.add(hrefUrl);
children.add(new Link(link.target, waybackHrefUrl));
}
}
}
}
} else if (urlMatchesScheme(pageUrl, watchedTarget.documentUrlScheme)) {
String contentDisposition = response.header("Content-Disposition");
contentDisposition = contentDisposition == null ?
"" : contentDisposition.replace("\"", "");
String contentType = response.header("Content-Type");
// The Content-Type header may be absent, so compare null-safely.
if ("application/pdf".equals(contentType) || contentDisposition.endsWith(".pdf")) {
Document document = new Document();
document.landingPageUrl = crawlWayback ?
urlFromWayback(link.source) : link.source;
document.documentUrl = pageUrl;
document.waybackTimestamp = crawlTime;
document.setStatus(Document.Status.NEW);
if ("application/pdf".equals(contentType))
document.filename = URLDecoder.decode(pageUrl.substring(pageUrl.lastIndexOf('/')+1), "UTF-8");
else
document.filename = contentDisposition.substring(contentDisposition.lastIndexOf('=')+1);
// A hidden PDF's filename may contain no dot, so guard before stripping the extension.
int extensionIndex = document.filename.lastIndexOf('.');
document.title = extensionIndex > 0 ? document.filename.substring(0, extensionIndex) : document.filename;
document.watchedTarget = watchedTarget;
document.fastSubjects = watchedTarget.fastSubjects;
Logger.debug("hidden pdf found: " + document.filename + " (url: " + pageUrl + ")");
foundDocuments.add(document);
if (maxDocuments != null && foundDocuments.size() >= maxDocuments) return;
}
}
}
} catch (IOException e) {
Logger.info("Can't get content of url: " + link.target);
e.printStackTrace();
}
}
breadthFirstSearch(watchedTarget, children, linkDepth - 1);
}
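/**
 * Applies the site-specific metadata extractor for the document's landing
 * page host, if one is registered, using the archived copy of the page.
 */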
public void extractMetadata(Document document) {
try {
String domain = new URI(document.landingPageUrl).getHost();
if (metadataExtractors.containsKey(domain)) {
MetadataExtractor metadataExtractor = metadataExtractors.get(domain);
String wbu = waybackReplayUrl(document.landingPageUrl, document.waybackTimestamp);
org.jsoup.nodes.Document doc = Jsoup.connect(wbu).get();
metadataExtractor.extract(document, doc);
}
} catch (IOException | URISyntaxException e) {
Logger.error("Can't extract metadata for document: " + document.landingPageUrl, e);
}
}
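/**
 * Fetches the URL with jsoup; ignoring the content type lets non-HTML
 * resources such as PDFs be requested without an exception.
 */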
private Response getResponse(String url) throws IOException {
Logger.debug("getResponse: " + url);
return Jsoup.connect(url).method(Method.GET).ignoreContentType(true).execute();
}
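/** Builds a Wayback replay URL for the given original URL and capture timestamp. */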
private String waybackReplayUrl(String url, String timestamp) {
return waybackUrl + "replay?url=" + url + "&date=" + timestamp;
}
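/**
 * Recovers the original URL from an archival URL, either from a "url="
 * query parameter or from a path-embedded replay URL such as
 * ".../replay/20140101120000/http://example.com/doc.pdf"; returns null
 * if neither pattern matches.
 */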
private String urlFromWayback(String archivalUrl) {
Pattern urlPattern1 = Pattern.compile("^[^?]+\\?.*url=([^&]+).*$");
Pattern urlPattern2 = Pattern.compile("^[^/]+//.*/([^/]+//.*)$");
Matcher matcher1 = urlPattern1.matcher(archivalUrl);
if (matcher1.matches())
return matcher1.group(1);
Matcher matcher2 = urlPattern2.matcher(archivalUrl);
if (matcher2.matches())
return matcher2.group(1);
return null;
}
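/** Compares the host parts of two full URLs (index 2 after splitting on "/"). */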
private boolean domainIsEqual(String url, String targetUrl) {
String[] urlParts = url.split("/");
String[] targetParts = targetUrl.split("/");
// Both URLs need at least "scheme://host" for a host part to exist at index 2.
if (urlParts.length <= 2 || targetParts.length <= 2) return false;
return urlParts[2].equals(targetParts[2]);
}
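/**
 * Checks whether the URL, with its protocol stripped, starts with the
 * watched target's document URL scheme (a host-plus-path prefix such as,
 * hypothetically, "example.org/publications/").
 */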
private boolean urlMatchesScheme(String url, String scheme) {
// Guard against URLs that carry no "//" protocol separator.
int protocolEnd = url.indexOf("//");
String urlWithoutProtocol = protocolEnd >= 0 ? url.substring(protocolEnd + 2) : url;
return urlWithoutProtocol.startsWith(scheme);
}
}