package uk.bl.crawling;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;

import com.avaje.ebean.Ebean;

import controllers.Documents;
import controllers.TargetController;
import controllers.WatchedTargets;
import models.Document;
import models.WatchedTarget;
import akka.actor.UntypedActor;
import play.Logger;
import play.libs.Akka;
import play.libs.F.Function0;
import play.libs.F.Promise;
import scala.concurrent.duration.Duration;

/**
 * Actor that crawls all watched targets for documents and converts the
 * documents found.
 *
 * <p>On a {@link CrawlMessage} it starts one asynchronous crawl per watched
 * target and schedules a {@link ConvertMessage} to itself one hour later. On a
 * {@link ConvertMessage} it converts every stored document whose
 * {@code sha256hash} is {@code null}.
 */
public class CrawlActor extends UntypedActor {

    /** Exit code of {@code convertPdfToHtml.sh} that is treated as "document could not be downloaded". */
    private static final int DOWNLOAD_FAILED_EXIT_CODE = 2;

    /** Message that triggers a crawl of all watched targets. */
    public static class CrawlMessage {}

    /** Message that triggers the conversion of all documents without a hash. */
    public static class ConvertMessage {}

    /**
     * Crawls one watched target for every crawl time newer than the ones
     * already processed.
     *
     * <p>Declared {@code static} because it never touches the enclosing actor.
     */
    private static class CrawlFunction implements Function0<List<Document>> {

        private final WatchedTarget watchedTarget;

        public CrawlFunction(WatchedTarget watchedTarget) {
            this.watchedTarget = watchedTarget;
        }

        /**
         * Runs the crawl.
         *
         * @return always {@code null}; the promise created in
         *         {@code onReceive} is started only for its side effects
         */
        @Override
        public List<Document> apply() {
            try {
                Logger.info("Crawling " + watchedTarget.target.fieldUrls.get(0).url);
                List<String> newerCrawlTimes = Crawler.getNewerCrawlTimes(watchedTarget);
                Logger.debug("got " + newerCrawlTimes.size() + " new crawl dates");
                for (String crawlTime : newerCrawlTimes) {
                    crawlDocuments(watchedTarget, true, crawlTime, 2, null);
                }
                Logger.info("Finished crawling " + watchedTarget.target.fieldUrls.get(0).url);
                return null;
            } catch (RuntimeException e) {
                // Nobody observes the promise, so without this log a failed
                // crawl would vanish without a trace.
                Logger.error("error while crawling watched target", e);
                throw e;
            }
        }
    }

    /**
     * Crawls a watched target and stores the documents that are new.
     *
     * <p>If the crawl finds no documents at all, a "No Documents Found" flag is
     * raised on the target. Otherwise the new documents are saved and, for
     * wayback crawls, the target's wayback timestamp is advanced to
     * {@code crawlTime} when that is later than the stored one.
     *
     * @param watchedTarget the target to crawl
     * @param crawlWayback  passed to the {@link Crawler} constructor; also
     *                      enables the wayback timestamp update
     * @param crawlTime     the crawl time handed to the crawler
     * @param depth         the crawl depth handed to the crawler
     * @param maxDocuments  the document limit handed to the crawler, may be {@code null}
     * @return the subset of crawled documents that {@code Documents.filterNew} kept
     */
    public static List<Document> crawlDocuments(WatchedTarget watchedTarget, boolean crawlWayback,
            String crawlTime, int depth, Integer maxDocuments) {
        Logger.debug("crawlDocuments of " + watchedTarget.target.fieldUrls.get(0).url + " (date: " + crawlTime + ")");
        List<Document> documentList =
                (new Crawler(crawlWayback)).crawlForDocuments(watchedTarget, crawlTime, depth, maxDocuments);
        List<Document> newDocumentList = Documents.filterNew(documentList);

        if (documentList.isEmpty()) {
            TargetController.raiseFlag(watchedTarget.target, "No Documents Found");
            return newDocumentList;
        }

        boolean isNewerCrawl = watchedTarget.waybackTimestamp == null
                || crawlTime.compareTo(watchedTarget.waybackTimestamp) > 0;
        if (crawlWayback && isNewerCrawl) {
            WatchedTargets.setWaybackTimestamp(watchedTarget, crawlTime);
        }
        Ebean.save(newDocumentList);
        return newDocumentList;
    }

    /**
     * Crawls a watched target and immediately converts the new documents.
     *
     * @see #crawlDocuments(WatchedTarget, boolean, String, int, Integer)
     * @see #convertDocuments(List)
     */
    public static void crawlAndConvertDocuments(WatchedTarget watchedTarget, boolean crawlWayback,
            String crawlTime, int depth, Integer maxDocuments) {
        convertDocuments(crawlDocuments(watchedTarget, crawlWayback, crawlTime, depth, maxDocuments));
    }

    /**
     * Converts each document with {@code convertPdfToHtml.sh}, adds its hashes
     * and finally compares the hashes collected in this run.
     *
     * <p>A failure on one document is logged and does not stop the others. If
     * the thread is interrupted, the interrupt flag is restored and the batch
     * is abandoned.
     *
     * @param newDocumentList the documents to convert
     */
    public static void convertDocuments(List<Document> newDocumentList) {
        // One hash file per run, named after the start time of the run.
        String ctphFile = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()) + ".txt";

        for (Document document : newDocumentList) {
            try {
                int exitCode = convertPdfToHtml(document, ctphFile);
                if (exitCode == DOWNLOAD_FAILED_EXIT_CODE) {
                    Logger.error("can't download document " + document.documentUrl);
                    continue;
                }
                // Any other exit code still yields hashes, even if the html
                // conversion itself failed.
                if (exitCode != 0) {
                    Logger.error("can't convert document " + document.documentUrl + " to html");
                }
                Documents.addHashes(document);
            } catch (InterruptedException e) {
                // Restore the flag and stop: every further waitFor() would
                // fail immediately anyway.
                Thread.currentThread().interrupt();
                Logger.error("interrupted while converting document " + document.documentUrl + " to html", e);
                return;
            } catch (Exception e) {
                Logger.error("error while converting document " + document.documentUrl + " to html", e);
            }
        }

        try {
            compareHashes(ctphFile);
            Documents.addDuplicateAlert(ctphFile);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            Logger.error("can't compare ctp hashes of " + ctphFile, e);
        } catch (Exception e) {
            Logger.error("can't compare ctp hashes of " + ctphFile, e);
        }
    }

    /** Converts every stored document whose {@code sha256hash} is {@code null}. */
    public static void convertDocuments() {
        List<Document> newDocumentList = Document.find.where().eq("sha256hash", null).findList();
        convertDocuments(newDocumentList);
    }

    /**
     * Runs {@code convertPdfToHtml.sh} for one document.
     *
     * @param document the document to convert
     * @param ctphFile name of the hash file handed to the script
     * @return the exit code of the script
     * @throws IOException          if the process cannot be started or its output cannot be read
     * @throws InterruptedException if the thread is interrupted while waiting for the script
     */
    private static int convertPdfToHtml(Document document, String ctphFile)
            throws IOException, InterruptedException {
        return runConverterScript("convertPdfToHtml.sh",
                String.valueOf(document.actualSourceUrl()), String.valueOf(document.id), ctphFile);
    }

    /**
     * Runs {@code compareHashes.sh} on a hash file.
     *
     * @param ctphFile name of the hash file handed to the script
     * @throws IOException          if the script cannot be run or exits with a non-zero code
     * @throws InterruptedException if the thread is interrupted while waiting for the script
     */
    private static void compareHashes(String ctphFile) throws IOException, InterruptedException {
        int exitCode = runConverterScript("compareHashes.sh", ctphFile);
        if (exitCode != 0) {
            throw new IOException("compareHashes.sh exited with code " + exitCode + " for " + ctphFile);
        }
    }

    /**
     * Runs a script from {@code conf/converter} through bash, logs its combined
     * output at debug level and waits for it to finish.
     *
     * <p>The arguments are handed to bash as positional parameters instead of
     * being pasted into the command string, so quotes or shell metacharacters
     * in an argument (for example in a crawled URL) cannot alter the command.
     * The output is always drained so the child cannot block on a full pipe.
     *
     * @param script file name of the script inside {@code conf/converter}
     * @param args   arguments passed to the script
     * @return the exit code of the script
     * @throws IOException          if the process cannot be started or its output cannot be read
     * @throws InterruptedException if the thread is interrupted while waiting for the script
     */
    private static int runConverterScript(String script, String... args)
            throws IOException, InterruptedException {
        StringBuilder shellCommand = new StringBuilder("cd conf/converter && ./").append(script);
        String[] command = new String[4 + args.length];
        command[0] = "/bin/bash";
        command[1] = "-c";
        command[3] = "bash"; // becomes $0 of the -c command; real arguments start at $1
        for (int i = 0; i < args.length; i++) {
            shellCommand.append(" \"${").append(i + 1).append("}\"");
            command[4 + i] = args[i];
        }
        command[2] = shellCommand.toString();

        ProcessBuilder builder = new ProcessBuilder(command);
        builder.redirectErrorStream(true);
        Process process = builder.start();

        BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                Logger.debug(line);
            }
        } finally {
            reader.close();
        }
        return process.waitFor();
    }

    /**
     * Handles {@link CrawlMessage} and {@link ConvertMessage}; every other
     * message is reported through {@code unhandled}.
     *
     * @param message the received message
     */
    @Override
    public void onReceive(Object message) throws Exception {
        if (message instanceof CrawlMessage) {
            Logger.info("Starting crawl");
            List<WatchedTarget> watchedTargets = WatchedTarget.find.all();
            for (WatchedTarget watchedTarget : watchedTargets) {
                Promise.promise(new CrawlFunction(watchedTarget));
            }
            // The crawls run asynchronously; the conversion is triggered one
            // hour after they were started.
            Akka.system().scheduler().scheduleOnce(
                    Duration.create(1, TimeUnit.HOURS),
                    getSelf(),
                    new CrawlActor.ConvertMessage(),
                    Akka.system().dispatcher(),
                    null
            );
        } else if (message instanceof ConvertMessage) {
            Logger.info("Convert documents");
            convertDocuments();
        } else {
            unhandled(message);
        }
    }
}