/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.framework;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.io.StringWriter;
import java.util.LinkedList;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.crawler.event.CrawlStateEvent;
import org.archive.crawler.reporting.AlertThreadGroup;
import org.archive.crawler.reporting.CrawlerLoggerModule;
import org.archive.crawler.reporting.StatisticsTracker;
import org.archive.modules.CandidateChain;
import org.archive.modules.CrawlMetadata;
import org.archive.modules.DispositionChain;
import org.archive.modules.FetchChain;
import org.archive.modules.net.ServerCache;
import org.archive.modules.seeds.SeedModule;
import org.archive.spring.ConfigPath;
import org.archive.util.ReportUtils;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.context.ApplicationEvent;
import org.springframework.context.Lifecycle;
import org.springframework.context.support.AbstractApplicationContext;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;
/**
* CrawlController collects all the classes which cooperate to
* perform a crawl and provides a high-level interface to the
* running crawl.
*
* As the "global context" for a crawl, subcomponents will
* often reach each other through the CrawlController.
*
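* <p>A minimal usage sketch (hedged: it assumes the controller has already
* been built and started inside a Spring application context, here called
* {@code appCtx}; only methods defined on this class are exercised):
* <pre>{@code
* CrawlController controller = appCtx.getBean(CrawlController.class);
* controller.requestCrawlStart();        // announce seeds, launch ToeThreads
* // ... later ...
* controller.requestCrawlPause();        // ask the Frontier to pause
* controller.requestCrawlResume();       // unpause a PAUSING/PAUSED crawl
* controller.requestCrawlStop(CrawlStatus.ABORTED); // begin an orderly stop
* }</pre>
*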
* @contributor gojomo
*/
public class CrawlController
implements Serializable,
Lifecycle,
ApplicationContextAware,
Checkpointable {
private static final long serialVersionUID = 1L;
// ApplicationContextAware implementation, for eventing
protected AbstractApplicationContext appCtx;
public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
this.appCtx = (AbstractApplicationContext)applicationContext;
}
protected CrawlMetadata metadata;
public CrawlMetadata getMetadata() {
return metadata;
}
@Autowired
public void setMetadata(CrawlMetadata provider) {
this.metadata = provider;
}
protected ServerCache serverCache;
public ServerCache getServerCache() {
return this.serverCache;
}
@Autowired
public void setServerCache(ServerCache serverCache) {
this.serverCache = serverCache;
}
/**
* The frontier to use for the crawl.
*/
protected Frontier frontier;
public Frontier getFrontier() {
return this.frontier;
}
@Autowired
public void setFrontier(Frontier frontier) {
this.frontier = frontier;
}
/**
* Scratch directory for temporary overflow-to-disk.
*/
protected ConfigPath scratchDir =
new ConfigPath("scratch subdirectory","scratch");
public ConfigPath getScratchDir() {
return scratchDir;
}
public void setScratchDir(ConfigPath scratchDir) {
this.scratchDir = scratchDir;
}
/**
* Statistics tracking module: a specialized tracker that monitors the
* crawl and writes logs and reports and/or provides information to the
* user interface.
*/
protected StatisticsTracker statisticsTracker;
public StatisticsTracker getStatisticsTracker() {
return this.statisticsTracker;
}
@Autowired
public void setStatisticsTracker(StatisticsTracker statisticsTracker) {
this.statisticsTracker = statisticsTracker;
}
protected SeedModule seeds;
public SeedModule getSeeds() {
return this.seeds;
}
@Autowired
public void setSeeds(SeedModule seeds) {
this.seeds = seeds;
}
/**
* Fetch chain
*/
protected FetchChain fetchChain;
public FetchChain getFetchChain() {
return this.fetchChain;
}
@Autowired
public void setFetchChain(FetchChain fetchChain) {
this.fetchChain = fetchChain;
}
/**
* Disposition chain
*/
protected DispositionChain dispositionChain;
public DispositionChain getDispositionChain() {
return this.dispositionChain;
}
@Autowired
public void setDispositionChain(DispositionChain dispositionChain) {
this.dispositionChain = dispositionChain;
}
/**
* Candidate chain
*/
protected CandidateChain candidateChain;
public CandidateChain getCandidateChain() {
return this.candidateChain;
}
@Autowired
public void setCandidateChain(CandidateChain candidateChain) {
this.candidateChain = candidateChain;
}
/**
* Maximum number of threads processing URIs at the same time.
*/
protected int maxToeThreads;
public int getMaxToeThreads() {
return maxToeThreads;
}
@Value("25")
public void setMaxToeThreads(int maxToeThreads) {
this.maxToeThreads = maxToeThreads;
if(toePool!=null) {
toePool.setSize(this.maxToeThreads);
}
}
/** whether to keep running (without pause or finish) when frontier is empty */
protected boolean runWhileEmpty = false;
public boolean getRunWhileEmpty() {
return runWhileEmpty;
}
public void setRunWhileEmpty(boolean runWhileEmpty) {
this.runWhileEmpty = runWhileEmpty;
}
/** whether to pause at crawl start */
protected boolean pauseAtStart = true;
public boolean getPauseAtStart() {
return pauseAtStart;
}
public void setPauseAtStart(boolean pauseAtStart) {
this.pauseAtStart = pauseAtStart;
}
/**
* Size in bytes of in-memory buffer to record outbound traffic. One such
* buffer is reserved for every ToeThread.
*/
protected int recorderOutBufferBytes = 16 * 1024; // 16KiB
public int getRecorderOutBufferBytes() {
return recorderOutBufferBytes;
}
public void setRecorderOutBufferBytes(int recorderOutBufferBytes) {
this.recorderOutBufferBytes = recorderOutBufferBytes;
}
/**
* Size in bytes of in-memory buffer to record inbound traffic. One such
* buffer is reserved for every ToeThread.
*/
protected int recorderInBufferBytes = 512 * 1024; // 512KiB
public int getRecorderInBufferBytes() {
return recorderInBufferBytes;
}
public void setRecorderInBufferBytes(int recorderInBufferBytes) {
this.recorderInBufferBytes = recorderInBufferBytes;
}
protected CrawlerLoggerModule loggerModule;
public CrawlerLoggerModule getLoggerModule() {
return this.loggerModule;
}
@Autowired
public void setLoggerModule(CrawlerLoggerModule loggerModule) {
this.loggerModule = loggerModule;
}
/**
* Logger for messages from the CrawlController; they appear on the
* console.
*/
private final static Logger LOGGER =
Logger.getLogger(CrawlController.class.getName());
private transient ToePool toePool;
// emergency reserve of memory to allow some progress/reporting after OOM
private transient LinkedList<byte[]> reserveMemory;
private static final int RESERVE_BLOCKS = 1;
private static final int RESERVE_BLOCK_SIZE = 12*1024*1024; // 12 MB
/**
* Crawl exit status.
*/
private transient CrawlStatus sExit = CrawlStatus.CREATED;
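/**
 * Controller lifecycle states, as driven by the request/complete methods
 * below: a NASCENT crawl moves through PREPARING on requestCrawlStart()
 * to PAUSED (if pauseAtStart) or RUNNING; the frontier may report EMPTY;
 * requestCrawlPause() passes through PAUSING to PAUSED; and
 * requestCrawlStop()/beginCrawlStop() pass through STOPPING to FINISHED.
 */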
public static enum State {
NASCENT, RUNNING, EMPTY, PAUSED, PAUSING,
STOPPING, FINISHED, PREPARING
}
transient private State state = State.NASCENT;
public CrawlController() {
}
transient protected AlertThreadGroup alertThreadGroup;
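/**
 * Spring Lifecycle start: remembers the current AlertThreadGroup for the
 * later ToePool launch, forces creation of the dnsjava cache (capped at
 * one entry), allocates the emergency memory reserve, and marks the
 * controller as running.
 */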
public void start() {
// cache AlertThreadGroup for later ToePool launch
AlertThreadGroup atg = AlertThreadGroup.current();
if(atg!=null) {
alertThreadGroup = atg;
}
if(isRunning) {
return;
}
sExit = CrawlStatus.FINISHED_ABNORMAL;
// force creation of DNS Cache now -- avoids CacheCleaner in toe-threads group
// also cap size at 1 (we never want a cached value; 0 is non-operative)
Lookup.getDefaultCache(DClass.IN).setMaxEntries(1);
reserveMemory = new LinkedList<byte[]>();
for(int i = 0; i < RESERVE_BLOCKS; i++) {
reserveMemory.add(new byte[RESERVE_BLOCK_SIZE]);
}
isRunning = true;
}
protected boolean isRunning = false;
public boolean isRunning() {
return isRunning;
}
public void stop() {
// TODO: more stop/cleanup?
isRunning = false;
}
/**
* Send a crawl state change event to all listeners.
* @param newState the state change to report to listeners
* @param status crawl status accompanying the change; its description
* becomes the event message
*/
protected void sendCrawlStateChangeEvent(State newState,
CrawlStatus status) {
if(this.state == newState) {
// suppress duplicate state-reports
return;
}
this.state = newState;
LOGGER.fine("reached CrawlController.State " + this.state + ", notifying listeners");
CrawlStateEvent event = new CrawlStateEvent(this,newState,status.getDescription());
appCtx.publishEvent(event);
}
// TODO: provide better knowledge/guard against twice-starting
protected boolean hasStarted = false;
public boolean hasStarted() {
return hasStarted;
}
protected boolean isStopComplete = false;
public boolean isStopComplete() {
return isStopComplete;
}
/**
* Operator requested that the crawl begin.
*/
public void requestCrawlStart() {
hasStarted = true;
sendCrawlStateChangeEvent(State.PREPARING, CrawlStatus.PREPARING);
if(recoveryCheckpoint==null) {
// only announce (trigger scheduling of) seeds
// when doing a cold (non-recovery) start
getSeeds().announceSeeds();
}
setupToePool();
// A proper exit will change this value.
this.sExit = CrawlStatus.FINISHED_ABNORMAL;
if (getPauseAtStart()) {
// the frontier starts out paused until run, so just
// acknowledge ('complete') the pause
completePause();
} else {
getFrontier().run();
}
}
/**
* Called when the last ToeThread exits.
*/
protected void completeStop() {
if (!isRunning) {
return;
}
LOGGER.fine("Entered complete stop.");
statisticsTracker.getSnapshot(); // ???
this.reserveMemory = null;
if (this.toePool != null) {
this.toePool.cleanup();
}
this.toePool = null;
LOGGER.fine("Finished crawl.");
try {
if (appCtx.isRunning()) {
appCtx.stop();
}
} catch (RuntimeException re) {
LOGGER.log(Level.SEVERE,re.getMessage(),re);
}
sendCrawlStateChangeEvent(State.FINISHED, this.sExit);
// CrawlJob needs to be sure all beans have received FINISHED signal before teardown
this.isStopComplete = true;
appCtx.publishEvent(new StopCompleteEvent(this));
}
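/**
 * Event published after stop is fully complete, so that listeners (such
 * as the enclosing crawl job) know every bean has received the FINISHED
 * notification before teardown begins.
 */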
public static class StopCompleteEvent extends ApplicationEvent {
private static final long serialVersionUID = 1L;
public StopCompleteEvent(Object source) {
super(source);
}
}
protected synchronized void completePause() {
sendCrawlStateChangeEvent(State.PAUSED, CrawlStatus.PAUSED);
}
private boolean shouldContinueCrawling() {
Frontier frontier = getFrontier();
if (frontier.isEmpty() && !getRunWhileEmpty()) {
this.sExit = CrawlStatus.FINISHED;
return false;
}
// unsure this is correct; perhaps should be constant true
return isActive();
}
/**
* Operator requested that the crawl stop.
*/
public synchronized void requestCrawlStop() {
if(state == State.STOPPING) {
// second stop request; nudge the threads with interrupts
getToePool().cleanup();
}
requestCrawlStop(CrawlStatus.ABORTED);
}
/**
* Operator requested that the crawl stop.
* @param message exit status to record for the crawl
*/
public synchronized void requestCrawlStop(CrawlStatus message) {
if (state == State.NASCENT) {
this.sExit = message;
this.state = State.FINISHED;
this.isStopComplete = true;
}
if (state == State.STOPPING || state == State.FINISHED ) {
return;
}
if (message == null) {
throw new IllegalArgumentException("Message cannot be null.");
}
if(this.sExit != CrawlStatus.FINISHED) {
// don't clobber an already-FINISHED with alternate status
this.sExit = message;
}
beginCrawlStop();
}
/**
* Start the process of stopping the crawl.
*/
public void beginCrawlStop() {
LOGGER.fine("Started.");
sendCrawlStateChangeEvent(State.STOPPING, this.sExit);
Frontier frontier = getFrontier();
if (frontier != null) {
frontier.terminate();
}
LOGGER.fine("Finished.");
}
/**
* Pause the crawl temporarily.
*/
public synchronized void requestCrawlPause() {
if (state == State.PAUSING || state == State.PAUSED) {
// Already about to pause
return;
}
sExit = CrawlStatus.WAITING_FOR_PAUSE;
getFrontier().pause();
sendCrawlStateChangeEvent(State.PAUSING, this.sExit);
// wait for pause to come via frontier changes
}
/**
* Tell if the controller is paused
* @return true if paused
*/
public boolean isPaused() {
return state == State.PAUSED;
}
public boolean isPausing() {
return state == State.PAUSING;
}
/**
* Is this crawl actively able/trying to crawl? Includes both
* states RUNNING and EMPTY.
* @return true if the state is RUNNING or EMPTY
*/
public boolean isActive() {
return state == State.RUNNING || state == State.EMPTY;
}
public boolean isFinished() {
return state == State.FINISHED;
}
/**
* Resume crawl from paused state
*/
public void requestCrawlResume() {
if (state != State.PAUSING && state != State.PAUSED) {
// Can't resume if we haven't been told to pause
return;
}
assert toePool != null;
Frontier f = getFrontier();
f.unpause();
sendCrawlStateChangeEvent(State.RUNNING, CrawlStatus.RUNNING);
}
/**
* @return Active toe thread count.
*/
public int getActiveToeCount() {
if (toePool == null) {
return 0;
}
return toePool.getActiveToeCount();
}
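/**
 * Create the ToePool of worker threads in the AlertThreadGroup captured
 * at start(), and size it to maxToeThreads.
 */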
protected void setupToePool() {
toePool = new ToePool(alertThreadGroup,this);
// TODO: make # of toes self-optimizing
toePool.setSize(getMaxToeThreads());
toePool.waitForAll();
}
/**
* @return The number of ToeThreads
*
* @see ToePool#getToeCount()
*/
public int getToeCount() {
return this.toePool == null? 0: this.toePool.getToeCount();
}
/**
* @return The ToePool
*/
public ToePool getToePool() {
return toePool;
}
/**
* Kills a thread. For details see
* {@link org.archive.crawler.framework.ToePool#killThread(int, boolean)
* ToePool.killThread(int, boolean)}.
* @param threadNumber Thread to kill.
* @param replace Should thread be replaced.
* @see org.archive.crawler.framework.ToePool#killThread(int, boolean)
*/
public void killThread(int threadNumber, boolean replace){
toePool.killThread(threadNumber, replace);
}
/**
* Evaluate if the crawl should stop because it is finished,
* without actually stopping the crawl.
*
* @return true if crawl is at a finish-possible state
*/
public boolean atFinish() {
return isActive() && !shouldContinueCrawling();
}
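/**
 * Custom deserialization: the (transient) state field is not restored by
 * defaultReadObject(), so a deserialized controller comes back PAUSED.
 */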
private void readObject(ObjectInputStream stream)
throws IOException, ClassNotFoundException {
this.state = State.PAUSED;
stream.defaultReadObject();
}
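/**
 * Release one block of the emergency memory reserve (if any remains) and
 * request a garbage collection, freeing headroom for post-OOM reporting.
 */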
public void freeReserveMemory() {
// reserve may be null before start() or after completeStop()
if(reserveMemory != null && !reserveMemory.isEmpty()) {
reserveMemory.removeLast();
System.gc();
}
}
/**
* Log to the progress statistics log.
* @param msg Message to write to the progress statistics log.
*/
public void logProgressStatistics(final String msg) {
loggerModule.getProgressStats().info(msg);
}
/**
* @return CrawlController state.
*/
public Object getState() {
return this.state;
}
public CrawlStatus getCrawlExitStatus() {
return this.sExit;
}
public String getToeThreadReport() {
if(toePool==null) {
return "no ToeThreads";
}
StringWriter sw = new StringWriter();
toePool.reportTo(new PrintWriter(sw));
return sw.toString();
}
public String getToeThreadReportShort() {
return (toePool == null) ? "" : ReportUtils.shortReportLine(toePool);
}
public Map<String,Object> getToeThreadReportShortData() {
return toePool == null ? null : toePool.shortReportMap();
}
public String getFrontierReportShort() {
return ReportUtils.shortReportLine(getFrontier());
}
/**
* Receive notification from the frontier, in the frontier's own
* manager thread, that the frontier has reached a new state.
*
* @param reachedState the state the frontier has reached
*/
public void noteFrontierState(Frontier.State reachedState) {
switch (reachedState) {
case RUN:
LOGGER.info("Crawl running.");
sendCrawlStateChangeEvent(State.RUNNING, CrawlStatus.RUNNING);
break;
case EMPTY:
LOGGER.info("Crawl empty.");
if(!getRunWhileEmpty()) {
this.sExit = CrawlStatus.FINISHED;
beginCrawlStop();
}
sendCrawlStateChangeEvent(State.EMPTY, CrawlStatus.RUNNING);
break;
case PAUSE:
if (state == State.PAUSING) {
completePause();
}
break;
case FINISH:
completeStop();
break;
default:
// do nothing
}
}
// Checkpointable
// CrawlController's only interest is in knowing that a Checkpoint is
// being recovered
public void startCheckpoint(Checkpoint checkpointInProgress) {}
public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException {}
public void finishCheckpoint(Checkpoint checkpointInProgress) {}
protected Checkpoint recoveryCheckpoint;
public void setRecoveryCheckpoint(Checkpoint recoveryCheckpoint) {
this.recoveryCheckpoint = recoveryCheckpoint;
}
}//EOC