package uk.bl.monitrix.heritrix.ingest;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;

import scala.concurrent.Future;

import akka.actor.ActorRef;
import akka.actor.ActorSystem;
import akka.actor.UntypedActor;
import akka.dispatch.Futures;
import akka.dispatch.OnSuccess;

import play.Logger;

import uk.bl.monitrix.database.DBIngestConnector;
import uk.bl.monitrix.heritrix.IncrementalLogfileReader;
import uk.bl.monitrix.heritrix.LogFileEntry;
import uk.bl.monitrix.heritrix.ingest.IngestStatus.Phase;
import uk.bl.monitrix.model.IngestSchedule;
import uk.bl.monitrix.model.IngestedLog;
/**
 * This class coordinates the actual work of monitoring log files and
 * running incremental ingest.
 * @author Rainer Simon <rainer.simon@ait.ac.at>
 */
public class IngestActor extends UntypedActor {

	/** The number of lines to sample when estimating the total no. of lines per log file. */
	private static final int LINE_NUMBER_ESTIMATION_SAMPLE_SIZE = 50000;

	private final DBIngestConnector db;

	private final ActorSystem system;

	// Written from the actor thread (CHANGE_SLEEP_INTERVAL), read by the synchronization
	// loop on a dispatcher thread - volatile for cross-thread visibility
	private static volatile long sleepInterval = 2000;

	/**
	 * The startup interval is the time for initial log file synchronization. After this time interval has elapsed,
	 * unavailable log files will be flagged as 'unavailable' and it will be continuously checked if they become
	 * available again.
	 */
	private volatile long startUpInterval;

	// These maps are mutated by future callbacks on dispatcher threads (estimateLinesAsync's
	// onSuccess handler, the synchronization loop) while also being read and iterated from the
	// actor thread. Plain HashMaps would risk ConcurrentModificationException and stale reads,
	// so concurrent maps are required here.
	private final Map<String, WatchedLog> newLogs = new ConcurrentHashMap<String, WatchedLog>();

	private final Map<String, WatchedLog> watchedLogs = new ConcurrentHashMap<String, WatchedLog>();

	private final Map<String, IngestStatus> statusList = new ConcurrentHashMap<String, IngestStatus>();

	// Flags shared between the actor thread and the synchronization loop - volatile so a
	// STOP command is guaranteed to become visible to the loop
	private volatile boolean isRunning = false;

	private volatile boolean keepRunning = true;

	public IngestActor(DBIngestConnector db, ActorSystem system) {
		this.db = db;
		this.system = system;
	}

	@Override
	public void onReceive(Object arg) throws Exception {
		if (!(arg instanceof IngestControlMessage)) {
			// Unknown message type - hand it to Akka's dead-letter handling instead of
			// killing the actor with a ClassCastException
			unhandled(arg);
			return;
		}

		IngestControlMessage msg = (IngestControlMessage) arg;
		if (msg.getCommand().equals(IngestControlMessage.Command.START)) {
			if (!isRunning) {
				isRunning = true;
				startSynchronizationLoop();
			}
		} else if (msg.getCommand().equals(IngestControlMessage.Command.STOP)) {
			keepRunning = false;
		} else if (msg.getCommand().equals(IngestControlMessage.Command.GET_STATUS)) {
			// Compute progress for logs in CATCHING_UP phase
			for (Entry<String, IngestStatus> entry : statusList.entrySet()) {
				IngestStatus status = entry.getValue();
				if (status.phase.equals(IngestStatus.Phase.CATCHING_UP)) {
					WatchedLog log = newLogs.get(entry.getKey());
					if (log != null) {
						// Clamp the estimate to >= 1 to avoid division by zero for empty files
						long estimated = Math.max(1, log.getEstimatedLineCount());
						status.progress = (int) ((100 * log.getReader().getNumberOfLinesRead()) / estimated);
					}
				}
			}
			getSender().tell(statusList, ActorRef.noSender());
		} else if (msg.getCommand().equals(IngestControlMessage.Command.SYNC_WITH_SCHEDULE)) {
			// Register every scheduled log that is not yet watched or pending
			for (IngestedLog log : db.getIngestSchedule().getLogs()) {
				String id = log.getId();
				if (!watchedLogs.containsKey(id) && !newLogs.containsKey(id)) {
					statusList.put(id, new IngestStatus(Phase.PENDING));
					estimateLinesAsync(log, LINE_NUMBER_ESTIMATION_SAMPLE_SIZE);
				}
			}
			// TODO update monitored/not-monitored info
		} else if (msg.getCommand().equals(IngestControlMessage.Command.CHANGE_SLEEP_INTERVAL)) {
			Long newInterval = (Long) msg.getPayload();
			sleepInterval = newInterval.longValue();
		} else if (msg.getCommand().equals(IngestControlMessage.Command.CHECK_RUNNING)) {
			getSender().tell(isRunning, ActorRef.noSender());
		}
	}

	/**
	 * Estimates the total number of lines in a text file by measuring the character count of
	 * the first (at most) <code>maxLines</code> lines, and assuming that all other lines have
	 * the same average size.
	 *
	 * <strong>Warning:</strong> this method assumes a 1-byte-per-character encoding (e.g. ASCII,
	 * or UTF-8 limited to ASCII characters). Detecting the encoding would be computationally too
	 * intensive. If the log file were encoded with N bytes per character, the result of this
	 * method will be too high by the factor N.
	 *
	 * When the estimation completes, a {@link WatchedLog} for the file is registered in
	 * {@link #newLogs} so that the synchronization loop picks it up.
	 *
	 * @param log the registered log whose file should be measured
	 * @param maxLines the maximum number of lines to sample from the file
	 */
	private void estimateLinesAsync(final IngestedLog log, final int maxLines) {
		Future<Long> f = Futures.future(new Callable<Long>() {
			@Override
			public Long call() throws Exception {
				Logger.info("Estimating number of lines for " + log.getPath());

				// Count the number of characters in the first N lines
				// NOTE(review): FileReader uses the platform default charset - presumably the
				// logs are ASCII/UTF-8, consistent with the 1-byte-per-char assumption above
				BufferedReader reader = new BufferedReader(new FileReader(log.getPath()));
				long characters = 0;
				int lineCount = 0;
				try {
					String line;
					// Check the line cap BEFORE reading, so we don't read (and discard) an
					// extra uncounted line once the sample is full
					while (lineCount < maxLines && (line = reader.readLine()) != null) {
						// +1 accounts for the line terminator stripped by readLine(); without
						// it the bytes-per-line ratio is skewed and the estimate too high
						characters += line.length() + 1;
						lineCount++;
					}
				} finally {
					reader.close();
				}
				Logger.info("Sample lines take up " + characters / (1024 * 1024) + " MB (assuming 1 byte per character)");

				File f = new File(log.getPath());
				long bytesTotal = f.length();
				Logger.info("File has " + bytesTotal / (1024 * 1024) + " MB total");

				long estimatedLines;
				if (characters == 0 || lineCount == 0) {
					// Empty file - avoid dividing by zero below
					estimatedLines = 0;
				} else {
					double ratio = (double) bytesTotal / (double) characters;
					// Scale by the number of lines actually read (not the sample cap), so
					// files shorter than maxLines are no longer over-estimated
					estimatedLines = (long) (lineCount * ratio);
				}
				Logger.info("Estimated a total number of " + estimatedLines + " lines");
				return estimatedLines;
			}
		}, system.dispatcher());

		Logger.info("Setting an onSuccess handler for watching " + log.getPath());
		f.onSuccess(new OnSuccess<Long>() {
			@Override
			public void onSuccess(Long estimatedLineCount) throws Throwable {
				Logger.info("Adding a WatchedLog for " + log.getPath());
				WatchedLog watchedLog = new WatchedLog(log, new IncrementalLogfileReader(log.getPath()));
				watchedLog.setEstimatedLineCount(estimatedLineCount);
				newLogs.put(log.getId(), watchedLog);
			}
		}, system.dispatcher());
	}

	/**
	 * Starts the log synchronization loop on a dispatcher thread. Each pass first catches up
	 * with newly registered logs, then incrementally synchronizes all watched logs, then (after
	 * the startup interval has elapsed) re-checks unavailable logs, and finally sleeps for
	 * {@link #sleepInterval} milliseconds. The loop ends when a STOP message clears
	 * {@link #keepRunning}.
	 */
	private void startSynchronizationLoop() throws InterruptedException, IOException {
		Futures.future(new Callable<Void>() {
			@Override
			public Void call() throws Exception {
				Logger.info("Starting log synchronization loop, " + keepRunning + ", " + newLogs.size());

				// initial startup interval time
				startUpInterval = 60 * sleepInterval;

				IngestSchedule schedule = db.getIngestSchedule();
				while (keepRunning) {
					// Check if there are new logs to watch - and ingest if any.
					// Snapshot the values so concurrent puts from estimateLinesAsync's
					// callback don't interfere with this pass.
					List<WatchedLog> toAdd = new ArrayList<WatchedLog>(newLogs.values());
					for (WatchedLog log : toAdd) {
						// TODO performance improvement
						if (schedule.isMonitoringEnabled(log.getLogInfo().getId())) {
							Logger.info("Catching up with log file " + log.getReader().getPath());
							catchUpWithLog(log);
							Logger.info("Caught up with log file " + log.getReader().getPath());
							newLogs.remove(log.getLogInfo().getId());
						}
					}

					// Sync all other logs
					for (WatchedLog log : watchedLogs.values()) {
						// TODO performance improvement
						String logId = log.getLogInfo().getId();
						String logPath = log.getLogInfo().getPath();
						if (schedule.isMonitoringEnabled(logId)) {
							// Check if file was renamed or moved in the mean time; if so, flag
							// it 'unavailable' instead of tailing a stale file handle
							if (log.getReader().isRenamed()) {
								Logger.debug("Log file " + logId + " registered at " + log.getLogInfo().getPath() + " was renamed or moved (status becomes 'Unavailable')");
								IngestStatus status = statusList.get(logId);
								if (status != null) {
									status.phase = Phase.UNAVAILABLE;
									Logger.info("Setting status to: " + status.phase.name());
								}
							} else {
								Logger.debug("Synchronizing log: " + logPath);
								synchronizeWithLog(log);
							}
						}
					}

					// Add new logs to the synchronization loop
					for (WatchedLog log : toAdd)
						watchedLogs.put(log.getLogInfo().getId(), log);

					// If the startup interval has elapsed, unavailable log files will be flagged
					// as 'unavailable' and it will be continuously checked if they become
					// available again.
					if (startUpInterval < 0 && watchedLogs.size() < db.getIngestSchedule().getLogs().size()) {
						Logger.warn("There are registered log files not available in the file system");
						for (IngestedLog log : db.getIngestSchedule().getLogs()) {
							String id = log.getId();
							if (!watchedLogs.containsKey(id) && !newLogs.containsKey(id)) {
								IngestStatus status = statusList.get(id);
								if (status == null) {
									// Defensive: register a status if SYNC_WITH_SCHEDULE never created one
									status = new IngestStatus(Phase.PENDING);
									statusList.put(id, status);
								}
								if (!(new File(log.getPath()).exists())) {
									Logger.info("Registered log file not available at " + log.getPath());
									status.phase = Phase.UNAVAILABLE;
									Logger.info("Setting status to: " + status.phase.name());
								} else {
									Logger.info("Adding a WatchedLog for " + log.getPath());
									WatchedLog watchedLog = new WatchedLog(log, new IncrementalLogfileReader(log.getPath()));
									status.phase = Phase.CATCHING_UP;
									catchUpWithLog(watchedLog);
									watchedLogs.put(id, watchedLog);
								}
							}
						}
					}

					// startup interval is decreased after each pass
					startUpInterval -= sleepInterval;

					// Go to sleep
					Thread.sleep(sleepInterval);
				}
				Logger.info("Stopping synchronization loop");
				return null;
			}
		}, system.dispatcher());
	}

	/**
	 * Performs the initial catch-up ingest for a log: skips the lines already ingested in a
	 * previous run (as recorded in the ingest schedule), then streams the remainder into the
	 * DB. Errors are logged but never propagated, so one broken log cannot kill the loop.
	 */
	private void catchUpWithLog(WatchedLog log) {
		try {
			IncrementalLogfileReader reader = log.getReader();
			long linesToSkip = db.getIngestSchedule().getLogForPath(reader.getPath()).getIngestedLines();
			Logger.info("Skipping " + linesToSkip + " (of estimated " + log.getEstimatedLineCount() + " log lines)");

			IngestStatus status = statusList.get(log.getLogInfo().getId());
			status.phase = IngestStatus.Phase.CATCHING_UP;
			reader.skipLines(linesToSkip);

			Iterator<LogFileEntry> iterator = reader.newIterator();
			Logger.info("Catching up with log file contents");
			db.insert(log.getLogInfo().getId(), iterator);
			status.phase = IngestStatus.Phase.IDLE;
			status.progress = 0;
		} catch (Throwable t) {
			// Log via the application logger rather than printStackTrace, so the error
			// is visible in the Play logs
			Logger.error("Error while catching up with log " + log.getLogInfo().getPath(), t);
		}
	}

	/**
	 * Incrementally ingests any lines appended to the log since the last pass. Errors are
	 * logged but never propagated, so one broken log cannot kill the loop.
	 */
	private void synchronizeWithLog(WatchedLog log) {
		try {
			IngestStatus status = statusList.get(log.getLogInfo().getId());
			status.phase = IngestStatus.Phase.SYNCHRONIZING;
			db.insert(log.getLogInfo().getId(), log.getReader().newIterator());
			status.phase = IngestStatus.Phase.IDLE;
		} catch (Throwable t) {
			// Log via the application logger rather than printStackTrace
			Logger.error("Error while synchronizing log " + log.getLogInfo().getPath(), t);
		}
	}

}