/*
 * CrawlActor.java example
 *
 * Explorer
 * w3act-master
 * - app
 * - test
 */
package uk.bl.crawling;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;

import com.avaje.ebean.Ebean;

import controllers.Documents;
import controllers.TargetController;
import controllers.WatchedTargets;
import models.Document;
import models.WatchedTarget;
import akka.actor.UntypedActor;
import play.Logger;
import play.libs.Akka;
import play.libs.F.Function0;
import play.libs.F.Promise;
import scala.concurrent.duration.Duration;

public class CrawlActor extends UntypedActor {
	
	public static class CrawlMessage {}
	public static class ConvertMessage {}
    
	private class CrawlFunction implements Function0<List<Document>> {
		
		private WatchedTarget watchedTarget;
		
		public CrawlFunction(WatchedTarget watchedTarget) {
			this.watchedTarget = watchedTarget;
		}
		
		@Override
		public List<Document> apply() {
			Logger.info("Crawling " + watchedTarget.target.fieldUrls.get(0).url);
			List<String> newerCrawlTimes = Crawler.getNewerCrawlTimes(watchedTarget);
			Logger.debug("got " + newerCrawlTimes.size() + " new crawl dates");
			for (String crawlTime : newerCrawlTimes)
				crawlDocuments(watchedTarget, true, crawlTime, 2, null);
			Logger.info("Finished crawling " + watchedTarget.target.fieldUrls.get(0).url);
			return null;
		}

	}
	
	public static List<Document> crawlDocuments(WatchedTarget watchedTarget,
			boolean crawlWayback, String crawlTime, int depth, Integer maxDocuments) {
		Logger.debug("crawlDocuments of " + watchedTarget.target.fieldUrls.get(0).url + " (date: " + crawlTime + ")");
		List<Document> documentList = (new Crawler(crawlWayback)).crawlForDocuments(watchedTarget, crawlTime, depth, maxDocuments);
		List<Document> newDocumentList = Documents.filterNew(documentList);
		if (documentList.isEmpty()) {
			TargetController.raiseFlag(watchedTarget.target, "No Documents Found");
		} else {
			if (crawlWayback &&
					(watchedTarget.waybackTimestamp == null ||
					crawlTime.compareTo(watchedTarget.waybackTimestamp) > 0)) {
				WatchedTargets.setWaybackTimestamp(watchedTarget, crawlTime);
			}			
			Ebean.save(newDocumentList);
		}		
		return newDocumentList;
	}
	
	public static void crawlAndConvertDocuments(WatchedTarget watchedTarget,
			boolean crawlWayback, String crawlTime, int depth, Integer maxDocuments) {
		convertDocuments(crawlDocuments(watchedTarget, crawlWayback, crawlTime, depth, maxDocuments));
	}
	
	public static void convertDocuments(List<Document> newDocumentList) {
		String ctphFile = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()) + ".txt";
		for (Document document : newDocumentList) {
			try {
				int exitCode = convertPdfToHtml(document, ctphFile);
				boolean hashesExist = exitCode != 2;
				if (hashesExist) {
					if (exitCode != 0)
						Logger.error("can't convert document " + document.documentUrl + " to html");
					Documents.addHashes(document);
				} else {
					Logger.error("can't download document " + document.documentUrl);
				}
			} catch (Exception e) {
				Logger.error("error while converting document " + document.documentUrl + " to html", e);
			}
		}
		try {
			compareHashes(ctphFile);
			Documents.addDuplicateAlert(ctphFile);
		} catch (Exception e) {
			Logger.error("can't compare ctp hashes of " + ctphFile, e);
		}		
	}

	public static void convertDocuments() {
		List<Document> newDocumentList = Document.find.where().eq("sha256hash", null).findList();
		convertDocuments(newDocumentList);
	}

	private static int convertPdfToHtml(Document document, String ctphFile) throws IOException, InterruptedException {
		ProcessBuilder builder = new ProcessBuilder("/bin/bash", "-c",
				"cd conf/converter && ./convertPdfToHtml.sh '" + document.actualSourceUrl() +
				"' '" + document.id + "' " + ctphFile);
		builder.redirectErrorStream(true);
		Process p = builder.start();
		BufferedReader r = new BufferedReader(new InputStreamReader(p.getInputStream()));
		String line;
		while (true) {
			line = r.readLine();
 			if (line == null) { break; }
			Logger.debug(line);
		}
		return p.waitFor();
	}
	
	private static void compareHashes(String ctphFile) throws IOException, InterruptedException {
		ProcessBuilder builder = new ProcessBuilder("/bin/bash", "-c",
				"cd conf/converter && ./compareHashes.sh " + ctphFile);
		Process p = builder.start();
		if (p.waitFor() != 0) {
			throw new IOException();
		}
	}
	
	@Override
	public void onReceive(Object message) throws Exception {
		if (message instanceof CrawlMessage) {
			Logger.info("Starting crawl");
			List<WatchedTarget> watchedTargets = WatchedTarget.find.all();
			for (WatchedTarget watchedTarget : watchedTargets) {
				Promise.promise(new CrawlFunction(watchedTarget));
			}
			Akka.system().scheduler().scheduleOnce(
					Duration.create(1, TimeUnit.HOURS),
					getSelf(),
					new CrawlActor.ConvertMessage(),
					Akka.system().dispatcher(),
					null
			);
		} else if (message instanceof ConvertMessage) {
			Logger.info("Convert documents");
			convertDocuments();
		}
	}
}