/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.framework;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.script.Bindings;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.commons.lang.StringUtils;
import org.archive.modules.seeds.SeedModule;
import org.archive.spring.ConfigPath;
import org.archive.util.ArchiveUtils;
import org.archive.util.FilesystemLinkMaker;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.context.Lifecycle;
/**
 * Directory watched for new files. Depending on their extension, will
 * process with regard to current crawl, and rename with a datestamp
 * into the 'done' directory.
 *
 * Currently supports (a trailing '.gz' is ignored when matching):
 *  - .seeds(.gz)
 *      hand the file to the configured SeedModule, which adds each URI
 *      found in the file as a new seed (to be crawled if not already;
 *      to affect scope if appropriate).
 *  - (.s).recover(.gz)
 *      treat as traditional recovery log: consider all 'Fs'- and
 *      'Fi'-tagged lines included, then try-rescheduling all
 *      'F+'-tagged lines. (If ".s." present, try scoping URIs before
 *      including/scheduling.)
 *  - (.s).include(.gz)
 *      add each URI found in a recover-log like file (regardless of its
 *      tagging) to the frontier's alreadyIncluded filter, preventing them
 *      from being recrawled. ('.s.' indicates to apply scoping.)
 *  - (.s).schedule(.gz)
 *      add each URI found in a recover-log like file (regardless of its
 *      tagging) to the frontier's queues. ('.s.' indicates to apply
 *      scoping.)
 *  - (.s).force(.gz)
 *      as '.schedule', but with the frontier import's fourth flag set
 *      (presumably "force fetch even if already seen" -- confirm against
 *      Frontier.importRecoverFormat).
 *  - any other extension for which a javax.script engine is registered
 *      evaluate the file as a script (see {@link #tryAsScript(File, String)}).
 *
 * Future support planned:
 *  - .robots: invalidate robots ASAP
 *  - (?) .block: block-all on named site(s)
 *  - .overlay: add new overlay settings
 *
 * @contributor gojomo
 */
public class ActionDirectory implements ApplicationContextAware, Lifecycle, Runnable {
    private static final Logger LOGGER =
        Logger.getLogger(ActionDirectory.class.getName());

    /** pause between attempts to move a processed file to the done directory */
    private static final long MOVE_RETRY_PAUSE_MS = 1000L;

    /** background executor running the periodic scan; null until start() */
    protected ScheduledExecutorService executor;

    /** how long after crawl start to first scan action directory */
    protected int initialDelaySeconds = 10;
    public int getInitialDelaySeconds() {
        return initialDelaySeconds;
    }
    public void setInitialDelaySeconds(int initialDelay) {
        this.initialDelaySeconds = initialDelay;
    }

    /** delay between scans of actionDirectory for new files */
    protected int delaySeconds = 30;
    public int getDelaySeconds() {
        return delaySeconds;
    }
    public void setDelaySeconds(int delay) {
        this.delaySeconds = delay;
    }

    /** directory scanned for incoming action files */
    protected ConfigPath actionDir =
        new ConfigPath("ActionDirectory source directory","action");
    public ConfigPath getActionDir() {
        return actionDir;
    }
    public void setActionDir(ConfigPath actionDir) {
        this.actionDir = actionDir;
    }

    /** directory receiving processed action files (and script output) */
    protected ConfigPath doneDir =
        new ConfigPath("ActionDirectory done directory","${launchId}/actions-done");
    public ConfigPath getDoneDir() {
        return doneDir;
    }
    public void setDoneDir(ConfigPath doneDir) {
        this.doneDir = doneDir;
    }

    /** application context, exposed to action scripts as 'appCtx' */
    protected ApplicationContext appCtx;
    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        this.appCtx = applicationContext;
    }

    /** autowired seed module that handles '.seeds' files */
    protected SeedModule seeds;
    public SeedModule getSeeds() {
        return this.seeds;
    }
    @Autowired
    public void setSeeds(SeedModule seeds) {
        this.seeds = seeds;
    }

    /** autowired frontier for actions */
    protected Frontier frontier;
    public Frontier getFrontier() {
        return this.frontier;
    }
    @Autowired
    public void setFrontier(Frontier frontier) {
        this.frontier = frontier;
    }

    /**
     * @return true if the background executor has been created and has
     * not been shut down
     */
    public boolean isRunning() {
        return executor != null && !executor.isShutdown();
    }

    /**
     * Ensure the action and done directories exist and are writeable,
     * then begin periodic scanning on a single background thread.
     * Does nothing if already running.
     *
     * @throws IllegalStateException if either directory cannot be
     * prepared
     */
    public void start() {
        if (isRunning()) {
            return;
        }
        try {
            // create directories
            org.archive.util.FileUtils.ensureWriteableDirectory(getActionDir().getFile());
            org.archive.util.FileUtils.ensureWriteableDirectory(getDoneDir().getFile());
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
        // start background executor
        executor = Executors.newSingleThreadScheduledExecutor();
        executor.scheduleWithFixedDelay(this, getInitialDelaySeconds(), getDelaySeconds(), TimeUnit.SECONDS);
    }

    /**
     * Shut down the background executor and wait for any in-progress
     * scan to finish. Safe to call when never started.
     */
    public void stop() {
        ScheduledExecutorService exec = executor;
        if (exec == null) {
            // never started; nothing to shut down
            return;
        }
        exec.shutdown();
        try {
            while (!exec.awaitTermination(10, TimeUnit.SECONDS)) {
                // keep waiting for the current scan to complete
            }
        } catch (InterruptedException e) {
            // stop waiting, but preserve the interrupt for the caller
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Action taken at scheduled intervals
     * @see java.lang.Runnable#run()
     */
    public void run() {
        try {
            scanActionDirectory();
        } catch (RuntimeException e) {
            // an exception escaping a scheduled task cancels all of its
            // future executions, so log and let the next scan try again
            LOGGER.log(Level.SEVERE,"action directory scan failed",e);
        }
    }

    /**
     * Find any new files in the 'action' directory; process each in
     * order.
     */
    protected void scanActionDirectory() {
        File dir = actionDir.getFile();
        File[] files = dir.listFiles((FileFilter)FileFilterUtils.fileFileFilter());
        if (files == null) {
            // listFiles returns null if dir is missing, not a directory,
            // or unreadable; skip this round rather than fail
            LOGGER.warning("unable to list action directory: "+dir);
            return;
        }
        Arrays.sort(files);
        for (File f : files) {
            try {
                actOn(f);
            } catch (Throwable e) {
                LOGGER.log(Level.SEVERE,"unhandled exception from actifile: "+f,e);
            }
        }
    }

    /**
     * Process an individual action file found, choosing the action by
     * the file's extension (ignoring any trailing '.gz'), then move the
     * file into the done directory.
     *
     * @param actionFile File to process
     */
    protected void actOn(File actionFile) {
        LOGGER.info("processing action file: "+actionFile);
        String filename = actionFile.getName();
        boolean isGzip = filename.endsWith(".gz");
        String corename = isGzip ? filename.substring(0,filename.length()-3) : filename;
        String timestamp = ArchiveUtils.get17DigitDate();

        if (corename.endsWith(".seeds")) {
            // import seeds
            getSeeds().actOn(actionFile);
        } else if (corename.endsWith(".recover")) {
            // apply recovery-log
            boolean alsoScope = corename.endsWith(".s.recover");
            try {
                // consider-included all successes and explicit-includes...
                getFrontier().importRecoverFormat(actionFile, alsoScope, true, false, "F[si] ");
                // then retry all adds...
                getFrontier().importRecoverFormat(actionFile, alsoScope, false, false, "F\\+ ");
            } catch (IOException ioe) {
                LOGGER.log(Level.SEVERE,"problem with action file: "+actionFile,ioe);
            }
        } else if (corename.endsWith(".include")) {
            // consider-included-only (do not schedule)
            boolean alsoScope = corename.endsWith(".s.include");
            try {
                getFrontier().importRecoverFormat(actionFile, alsoScope, true, false, ".*");
            } catch (IOException ioe) {
                LOGGER.log(Level.SEVERE,"problem with action file: "+actionFile,ioe);
            }
        } else if (corename.endsWith(".schedule")) {
            // schedule to queues
            boolean alsoScope = corename.endsWith(".s.schedule");
            try {
                getFrontier().importRecoverFormat(actionFile, alsoScope, false, false, ".*");
            } catch (IOException ioe) {
                LOGGER.log(Level.SEVERE,"problem with action file: "+actionFile,ioe);
            }
        } else if (corename.endsWith(".force")) {
            // schedule to queues, with the import's fourth flag set
            boolean alsoScope = corename.endsWith(".s.force");
            try {
                getFrontier().importRecoverFormat(actionFile, alsoScope, false, true, ".*");
            } catch (IOException ioe) {
                LOGGER.log(Level.SEVERE,"problem with action file: "+actionFile,ioe);
            }
//        } else if (filename.endsWith(".robots")) {
//            // force refresh of robots
//            // TODO
        } else if (!tryAsScript(actionFile,timestamp)) {
            LOGGER.warning("action file ignored: "+actionFile);
        }

        moveToDone(actionFile, timestamp);
    }

    /**
     * Move a processed file to the 'done' area with a timestamp prefix,
     * retrying until the file no longer exists in the action directory
     * (otherwise the next scan would process it again). Failed attempts
     * are separated by a short pause; the retry loop is abandoned if the
     * thread is interrupted.
     *
     * @param actionFile processed file to move
     * @param timestamp prefix for the done file's name
     */
    private void moveToDone(File actionFile, String timestamp) {
        while (actionFile.exists()) {
            try {
                File doneFile = new File(doneDir.getFile(),timestamp+"."+actionFile.getName());
                FileUtils.moveFile(actionFile, doneFile);

                // attempt to symlink from action/done/ to done file
                File actionDoneDirFile = new File(actionDir.getFile(), "done");
                if (!actionDoneDirFile.equals(doneDir.getFile())) {
                    actionDoneDirFile.mkdirs();
                    File doneSymlinkFile = new File(actionDoneDirFile, doneFile.getName());
                    boolean success = FilesystemLinkMaker.makeSymbolicLink(doneFile.getPath(), doneSymlinkFile.getPath());
                    if (!success) {
                        LOGGER.warning("failed to create symlink from " + doneSymlinkFile + " to " + doneFile);
                    }
                }
            } catch (IOException e) {
                LOGGER.log(Level.SEVERE,"unable to move "+actionFile,e);
                // pause before retrying so a persistent failure does not
                // spin the CPU and flood the log
                try {
                    Thread.sleep(MOVE_RETRY_PAUSE_MS);
                } catch (InterruptedException ie) {
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        }
    }

    /** shared ScriptEngineManager */
    protected static ScriptEngineManager MANAGER = new ScriptEngineManager();

    /**
     * Try the actionFile as a script, deducing the proper scripting
     * language from its file extension. Return true if evaluation was
     * tried with a known script engine.
     *
     * Provides 'appCtx' and 'rawOut' to script for accessing crawl
     * and outputting text to a '.out' file paired with the 'done/'
     * action file. If an exception occurs, its toString() will be
     * written to an '.exception' file alongside the script file in
     * 'done/'.
     *
     * NOTE(review): the file's contents are evaluated as-is, with the
     * application context in scope, so anyone able to write into the
     * action directory can run code inside this JVM.
     *
     * @param actionFile file to try
     * @param timestamp timestamp correlating out/exception files with
     * done script
     * @return true if engine evaluation began (even if an error occurred)
     */
    protected boolean tryAsScript(File actionFile, String timestamp) {
        int i = actionFile.getName().lastIndexOf(".");
        if (i<0) {
            return false;
        }

        // deduce language/engine from extension
        String extension = actionFile.getName().substring(i+1);
        ScriptEngine engine = MANAGER.getEngineByExtension(extension);
        if (engine==null) {
            return false;
        }

        // prepare engine
        StringWriter rawString = new StringWriter();
        PrintWriter rawOut = new PrintWriter(rawString);
        Exception ex = null;
        Bindings bindings = new BeanLookupBindings(appCtx);
        bindings.put("rawOut", rawOut);
        bindings.put("appCtx", appCtx);

        // evaluate and record any exception
        try {
            String script = FileUtils.readFileToString(actionFile);
            engine.eval(script, bindings);
        } catch (IOException e) {
            ex = e;
        } catch (ScriptException e) {
            ex = e;
        } catch (RuntimeException e) {
            ex = e;
        } finally {
            // the script could create an object that persists and retains a reference to the Bindings
            bindings.put("rawOut", null);
            bindings.put("appCtx", null);
        }

        // report output/exception to files paired with script in done dir
        rawOut.flush();
        String allOut = rawString.toString();
        if (StringUtils.isNotBlank(allOut)) {
            File outFile = new File(doneDir.getFile(),timestamp+"."+actionFile.getName()+".out");
            try {
                FileUtils.writeStringToFile(outFile, allOut);
            } catch (IOException ioe) {
                LOGGER.log(Level.SEVERE,"problem during action file: "+actionFile,ioe);
            }
        }
        if (ex!=null) {
            File exFile = new File(doneDir.getFile(),timestamp+"."+actionFile.getName()+".exception");
            try {
                FileUtils.writeStringToFile(exFile, ex.toString());
            } catch (IOException ioe) {
                LOGGER.log(Level.SEVERE,"problem during action file: "+actionFile,ioe);
            }
        }
        return true;
    }
}