package ecologylab.bigsemantics.collecting;
import java.util.ArrayList;
import java.util.Observable;
import java.util.Observer;
import ecologylab.appframework.types.prefs.PrefBoolean;
import ecologylab.bigsemantics.documentparsers.RichDocumentParserCrawlerResult;
import ecologylab.bigsemantics.gui.InteractiveSpace;
import ecologylab.bigsemantics.metadata.builtins.RichDocument;
import ecologylab.bigsemantics.metadata.builtins.Document;
import ecologylab.bigsemantics.metadata.builtins.DocumentClosure;
import ecologylab.bigsemantics.metadata.builtins.TextClipping;
import ecologylab.bigsemantics.model.text.InterestModel;
import ecologylab.bigsemantics.model.text.TermVector;
import ecologylab.bigsemantics.model.text.TermVectorWeightStrategy;
import ecologylab.bigsemantics.seeding.SemanticsPrefs;
import ecologylab.collections.GenericElement;
import ecologylab.collections.PrioritizedPool;
import ecologylab.collections.WeightSet;
import ecologylab.generic.Debug;
import ecologylab.generic.Generic;
import ecologylab.generic.ThreadMaster;
/**
 * Crawler encapsulates all state regarding web crawling: the WeightSets and
 * PrioritizedPools of candidate unparsed simple Document and CompoundDocument objects.
 *
 * @author andruid
 */
public class Crawler extends Debug
implements Observer, ThreadMaster, Runnable, SemanticsPrefs
{
protected SemanticsSessionScope semanticsSessionScope;
Seeding seeding;
/**
* Priority to run at when we seem to have an overabundance of <code>MediaElement</code>s ready
* for display.
*/
protected static final int LO_PRIORITY = 3;
/**
* Priority to run at normally.
*/
protected static final int MID_PRIORITY = 4;
/**
* Priority to run at when we do not have enough <code>MediaElement</code>s ready for display.
*/
protected static final int HI_PRIORITY = 5;
  /**
   * Initial priority, which is {@link #MID_PRIORITY MID_PRIORITY}, balancing the burst of
   * crawling we expect at the start (our global collections begin empty) against other work.
   */
protected static final int PRIORITY = MID_PRIORITY;
  /**
   * The load factor used when initializing our large hash tables.
   */
static final float HASH_LOAD = .5f;
  /**
   * When the candidateDocumentClosuresPool has more entries than this, it will be pruned.
   */
  static final int MAX_PAGES = 4096;
public static final int NUM_GENERATIONS_IN_CONTAINER_POOL = 5;
  public static final int MAX_PAGES_PER_GENERATION = MAX_PAGES / NUM_GENERATIONS_IN_CONTAINER_POOL;
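  // With the values above, MAX_PAGES_PER_GENERATION == 4096 / 5 == 819 (integer division),
  // so each generational WeightSet holds at most 819 candidates before pruning.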
private final PrioritizedPool<DocumentClosure> candidateDocumentClosuresPool;
/**
* true when the session is ending.
*/
protected boolean finished;
/**
* Controls whether or not the crawler is (temporarily) paused.
*/
protected boolean running = true;
boolean crashed;
/**
* Web crawler sleep time when we are in need of collecting more media.
*/
protected final int usualSleep;
/**
* Web crawler sleep time when there seems to be plenty of media already collected.
*/
protected final int longSleep;
  /**
   * Controls whether we periodically download the DocumentClosures associated with outlinks
   * that have been discovered; in other words, whether we crawl at all. Set as a preference
   * at runtime, and via a menu entry.
   */
protected PrefBoolean downloadLinksAutomatically = CRAWL;
// +++++++++++ state for thread +++++++++++ //
/**
* The web crawler thread.
*/
protected Thread thread;
/**
* The number of loops through the web crawler. A performance statistic that roughly corresponds
* to how many new <code>Container</code>s have been queued for download and parse, during this
* session.
*/
protected int count;
  protected boolean collectingImages;
  protected boolean collectingText;
  public boolean isCollectingImages()
  {
    return collectingImages;
  }
  public boolean isCollectingText()
  {
    return collectingText;
  }
  /**
   * Construct a Crawler: register as an Observer of the participant interest TermVector,
   * and build the generational PrioritizedPool of candidate DocumentClosures.
   */
public Crawler()
{
TermVector piv = InterestModel.getPIV();
piv.addObserver(this);
    // One WeightSet per generation, each weighting candidates against the interest vector.
    WeightSet<DocumentClosure>[] documentClosureWeightSets = new WeightSet[NUM_GENERATIONS_IN_CONTAINER_POOL];
    for (int i = 0; i < NUM_GENERATIONS_IN_CONTAINER_POOL; i++)
      documentClosureWeightSets[i] = new WeightSet<DocumentClosure>(MAX_PAGES_PER_GENERATION, this,
          (TermVectorWeightStrategy) new DownloadContainerWeightingStrategy(piv));
candidateDocumentClosuresPool = new PrioritizedPool<DocumentClosure>(documentClosureWeightSets);
finished = false;
usualSleep = 3000;
longSleep = usualSleep * 5 / 2;
}
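  // A minimal usage sketch (assumes a configured Seeding; session-scope wiring elided):
  //
  //   Crawler crawler = new Crawler();
  //   crawler.setSeeding(seeding);
  //   crawler.start();  // spawns the "InfoCollector" thread when crawling is enabled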
  /**
   * Stop the threads that are responsible for collecting new surrogates, including the
   * candidateImageVisualsPool, crawler, and seeding download monitors.
   *
   * @param kill true to terminate the download monitor threads outright, rather than letting them finish.
   */
public void stopCollectingAgents(boolean kill)
{
semanticsSessionScope.getDownloadMonitors().stop(kill);
}
final Object startCrawlerSemaphore = new Object();
//FIXME
public synchronized void start()
{
if (downloadLinksAutomatically.value())
{
if (!crashed && (thread == null) && !seeding.isDuringSeeding())
{
debug("Starting up.");
// Thread.dumpStack();
finished = false;
thread = new Thread(this, "InfoCollector");
//ThreadDebugger.registerMyself(thread);
Generic.setPriority(PRIORITY);
thread.start();
}
else
{
unpause();
}
}
}
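  /**
   * Temporarily stop the crawler thread from queuing new downloads; the thread then waits
   * (see {@link #waitIfNotRunning()}) until {@link #unpause()} is called.
   */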
public synchronized void pause()
{
if (thread != null)
{
debug("pause()");
running = false;
}
}
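  /**
   * Resume the crawler thread, provided it exists, crawling is enabled, and seeding is not
   * in progress.
   */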
public synchronized void unpause()
{
// if ((thread != null) && !running && downloadLinksAutomatically)
if ((thread != null) && downloadLinksAutomatically.value() && !seeding.isDuringSeeding())
{
debug("unpause()");
running = true;
notifyAll();
}
}
  /**
   * When the interest model changes, scan the candidate DocumentClosures and ask each one's
   * ancestor crawler result whether a better outlink should take its place in the pool.
   */
private void checkCandidatesParserResultsForBetterOutlinks()
{
    synchronized (candidateDocumentClosuresPool)
{
WeightSet<DocumentClosure>[] candidateSet = (WeightSet<DocumentClosure>[]) candidateDocumentClosuresPool.getWeightSets();
int maxSize = candidateDocumentClosuresPool.maxSize();
ArrayList<DocumentClosure> removeContainers = new ArrayList<DocumentClosure>(maxSize);
ArrayList<DocumentClosure> insertContainers = new ArrayList<DocumentClosure>(maxSize);
      for (WeightSet<DocumentClosure> candidates : candidateSet)
{
for (DocumentClosure c : candidates)
{
Document ancestorDocument = c.getDocument().getAncestor();
if (ancestorDocument != null && !(ancestorDocument.isRecycled() /*|| ancestor.isRecycling() */))
{
RichDocumentParserCrawlerResult crawlerResult = (RichDocumentParserCrawlerResult) ancestorDocument.getParserResult();
if (crawlerResult != null)
{
DocumentClosure d = crawlerResult.swapNextBestOutlinkWith(c);
if (d != null && c != d)
{
removeContainers.add(c);
insertContainers.add(d);
}
}
}
}
        int swapSize = insertContainers.size();
        if (swapSize > 0)
          System.out.println("Swapping containers:\n\tReplacing " + swapSize);
        for (DocumentClosure c : removeContainers)
          candidates.remove(c);
        removeContainers.clear();
        // Inserting directly into the WeightSet is OK, because the
        // replacement closure belongs to the same generation.
        for (DocumentClosure c : insertContainers)
          candidates.insert(c);
insertContainers.clear();
}
}
}
/**
* Blank implementation in base class.
*
* @param replaceMe TextClipping to remove
*/
public void removeTextClippingFromPools(GenericElement<TextClipping> replaceMe)
{
}
  /**
   * Blank implementation in base class.
   *
   * @param replaceMe DocumentClosure of the image clipping to remove
   */
public void removeImageClippingFromPools(DocumentClosure replaceMe)
{
}
  /**
   * Always returns 0 in this base class.
   */
public int imagePoolsSize()
{
return 0;
}
/**
* Used to assess how much need we have for more TextClippings.
*
* @return false in base class implementation.
*/
public boolean candidateTextClippingsSetIsAlmostEmpty()
{
return false;
}
  /**
   * Collect the TextClipping if it is worthwhile, based on its weight and on whether it is
   * the first representative of its CompoundDocument.
   *
   * @param textClippingGE TextClipping to potentially collect
   * @param numSurrogatesCollectedFromCompoundDocument how many surrogates have already been collected from the source CompoundDocument
   * @param clippingPoolPriority priority level of the clipping pool to collect into
   *
   * @return always false in this base class implementation, because we do not collect TextClippings.
   */
public boolean collectTextClippingIfWorthwhile(GenericElement<TextClipping> textClippingGE, int numSurrogatesCollectedFromCompoundDocument, int clippingPoolPriority)
{
return false;
}
/**
* This is an Observer of changes in the TermVectors, which change when the interest model changes.
*
* When the interest model changes, we iterate through candidate DocumentClosures to see if they have a better link
* to contribute to our global crawler state.
*/
@Override
public void update(Observable o, Object arg)
{
checkCandidatesParserResultsForBetterOutlinks();
}
  /**
   * Base class implementation does nothing.
   */
  public void increaseNumImageReferences()
  {
  }
  /**
   * Base class implementation does nothing.
   */
  public void decreaseNumImageReferences()
  {
  }
  /**
   * Base class implementation does nothing.
   *
   * @param textClippingGE TextClipping to add
   * @param poolPriority priority level of the pool to add it to
   */
public void addTextClippingToPool(GenericElement<TextClipping> textClippingGE, int poolPriority)
{
}
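  /**
   * Convenience entry point: get or construct the DocumentClosure for the given Document,
   * and offer it to the candidate pool.
   *
   * @param document Document to consider as a crawl candidate.
   * @return true if its closure was accepted into the pool.
   */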
public boolean addDocumentToPool(Document document)
{
return addClosureToPool(document.getOrConstructClosure());
}
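  /**
   * Add an unprocessed, non-seed DocumentClosure to the candidate pool, unless its Document
   * exceeds the link-count thresholds. The closure is inserted into the WeightSet whose
   * index matches the document's effective generation, clamped to the last set.
   *
   * @param candidate DocumentClosure to consider as a crawl candidate.
   * @return true if the closure was inserted into the pool.
   */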
public boolean addClosureToPool(DocumentClosure candidate)
{
if (candidate != null)
{
    synchronized (candidate)
{
if (candidate.isUnprocessed() && !candidate.isSeed())
{
Document document = candidate.getDocument();
if (!exceedsLinkCountThresholds(document))
{
int generation = document.getEffectiveGeneration();
int maxPoolNum = candidateDocumentClosuresPool.numWeightSets() - 1;
if (generation > maxPoolNum)
generation = maxPoolNum;
          debugT("---\t---\t---\tAdding DocumentClosure to candidateDocumentClosuresPool: " + candidate);
candidateDocumentClosuresPool.insert(candidate, generation);
return true;
}
}
}
}
return false;
}
  /**
   * Determines whether a given <code>Document</code> exceeds the <code>linkCount</code> thresholds.
   * Crawling too many links without seeing that the user is interested tends to lead to noisy content.
   * <p/>
   * Current thresholds are as follows:
   * <ul>
   * <li><code>linkCount</code> of 2 if the link is to a new domain</li>
   * <li><code>linkCount</code> of 4 if the link is to the same domain</li>
   * </ul>
   * These thresholds are overridden if the user has expressed interest in surrogates from
   * this particular <code>Document</code>.
   *
   * @param document the <code>Document</code> to evaluate against the thresholds
   * @return <code>true</code> if the document's <code>linkCount</code> exceeds the thresholds
   */
public boolean exceedsLinkCountThresholds(Document document)
{
// debug("---------exceedsLinkCountThresholds---------");
int linkCount = document.getGeneration();
boolean sameDomainAsPrevious = document.isSameDomainAsPrevious();
short participantInterestIntensity = InterestModel.getInterestExpressedInTermVector(document.termVector());
if (linkCount <= 2)
{
// debug("---ACCEPT1: intensity: " + participantInterestIntensity + " link count: " + linkCount + " same domain: " + sameDomainAsPrevious);
return false;
}
else if (linkCount <= 4 && sameDomainAsPrevious)
{
// debug("---ACCEPT2: intensity: " + participantInterestIntensity + " link count: " + linkCount + " same domain: " + sameDomainAsPrevious);
return false;
}
else if (participantInterestIntensity > 0) // TODO: make sure participant interest is being kept up
{
// debug("---ACCEPT3: intensity: " + participantInterestIntensity + " link count: " + linkCount + " same domain: " + sameDomainAsPrevious);
return false;
}
else
{
//debug("--- exceedsLinkCountThresholds() REJECT : intensity: " + participantInterestIntensity + " link count: " + linkCount + " same domain: " + sameDomainAsPrevious);
// System.out.println("Ancestors Interest = " + element.ancestor().participantInterest().intensity() );
return true;
}
}
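  // Illustrative traces: a document at linkCount 3 reached via a new domain exceeds the
  // thresholds unless the participant has expressed interest in its terms; at linkCount 3
  // within the same domain, exceedsLinkCountThresholds() returns false and it is accepted.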
/**
* Message to the user when the crawler stops because there's no place
* left to crawl to.
*/
static final String TRAVERSAL_COMPLETE_MSG =
"The web crawler is pausing:\ntraversal of the information space is complete.\nThere are no more traversable pages to crawl to.";
boolean threadsArePaused;
boolean collectingThreadsPaused;
  /**
   * Pause all the threads we know about.
   *
   * @return true if the threads existed and were not already paused, so that the previous
   *         paused state can be restored later.
   */
@Override
public boolean pauseThreads()
{
boolean needToPause = (thread != null) && !threadsArePaused;
if (needToPause)
{
threadsArePaused = true;
//debug("Container.pauseThreads()");
InteractiveSpace interactiveSpace = semanticsSessionScope.getInteractiveSpace();
if (interactiveSpace != null)
interactiveSpace.pauseIfPlaying();
pauseThreadsExceptCompositionAgent();
if (interactiveSpace != null)
interactiveSpace.waitIfPlaying();
}
return needToPause;
}
boolean threadsExceptCompositionArePause;
  /**
   * Pause every thread except the composition agent: the crawler, image collecting, the
   * interactive space pipeline, and the regular download monitors.
   *
   * @return true if the threads existed and were not already paused.
   */
public boolean pauseThreadsExceptCompositionAgent()
{
boolean needToPause = (thread != null) && !threadsExceptCompositionArePause;
if (needToPause)
{
pause();
InteractiveSpace interactiveSpace = semanticsSessionScope.getInteractiveSpace();
if (interactiveSpace != null)
interactiveSpace.pausePipeline();
pauseImageCollecting();
semanticsSessionScope.getDownloadMonitors().pauseRegularDownloadMonitors();
threadsExceptCompositionArePause = true;
}
return needToPause;
}
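  /**
   * Pause just the collecting threads: image collecting and the crawler itself.
   */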
public void pauseCollectingThreads()
{
if ((thread != null) && !collectingThreadsPaused)
{
collectingThreadsPaused = true;
pauseImageCollecting();
pause();
}
}
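  /**
   * Resume the collecting threads previously paused by {@link #pauseCollectingThreads()}.
   */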
public void unpauseCollectingThreads()
{
if (collectingThreadsPaused)
{
collectingThreadsPaused = false;
unpauseImageCollecting();
unpause();
}
}
/**
* Pause the candidate Images collecting thread.
*
* Base class implementation does nothing.
*/
protected void pauseImageCollecting()
{
}
/**
* Unpause the candidate Images collecting thread.
*
* Base class implementation does nothing.
*/
protected void unpauseImageCollecting()
{
}
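  /**
   * @return a one-line summary of the three thread-pause flags, for debugging.
   */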
public String getThreadStatuses()
{
return "ThreadsPaused = AllThreads("+ threadsArePaused + ") " +
"Collect(" + collectingThreadsPaused + ") " +
"NonComposition(" + threadsExceptCompositionArePause + ")";
}
/**
* Unpause (continue) all the threads we know about.
*/
@Override
public void unpauseThreads()
{
if (threadsArePaused)
{
threadsArePaused = false;
//debug("Container.unpauseThreads() crawlerDownloadMonitor waitingToDownload="+
// crawlerDownloadMonitor.waitingToDownload());
InteractiveSpace interactiveSpace = semanticsSessionScope.getInteractiveSpace();
if (interactiveSpace != null)
{
interactiveSpace.restorePlayIfWasPlaying();
interactiveSpace.unpausePipeline();
}
unpauseNonCompositionThreads();
}
    else if (threadsExceptCompositionArePause)
{
unpauseNonCompositionThreads();
}
}
private void unpauseNonCompositionThreads()
{
threadsExceptCompositionArePause = false;
unpauseImageCollecting();
semanticsSessionScope.getDownloadMonitors().unpauseRegularDownloadMonitors();
unpause();
}
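  /**
   * Stop the crawler and its collecting agents without killing threads; equivalent to stop(false).
   */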
public void stop()
{
stop(false);
}
public void stop(boolean kill)
{
    finished = true;
    stopCollectingAgents(kill); // already stops the download monitors
// clear all the collections when the CF browser exits -- eunyee
//ThreadDebugger.clear();
clearCollections();
}
/**
* Clear the candidateDocumentClosuresPool.
*
*/
public void clearCollections()
{
candidateDocumentClosuresPool.clear();
}
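  /**
   * @return true if the crawler is running and automatic link download is enabled.
   */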
public boolean isOn()
{
return running && downloadLinksAutomatically.value();
}
  /**
   * The crawler thread's main loop. Empty in this base class.
   */
  @Override
  public void run()
  {
  }
// ------------------- Thread related state handling ------------------- //
//
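  /**
   * Block the crawler thread while it is paused or crawling is disabled; returns once
   * notifyAll() wakes it, setting running back to true.
   */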
protected synchronized void waitIfNotRunning()
{
if (!running || !downloadLinksAutomatically.value())
{
try
{
debug("waitIfOff() waiting");
wait();
running = true;
}
catch (InterruptedException e)
{
debug("run(): wait interrupted: ");
e.printStackTrace();
Thread.interrupted(); // clear the interrupt
}
}
}
/**
* @return the seeding
*/
public Seeding getSeeding()
{
return seeding;
}
/**
* @param seeding the seeding to set
*/
public void setSeeding(Seeding seeding)
{
this.seeding = seeding;
}
  /**
   * Construct a ParserResult object of the type that matches this crawler.
   *
   * @param compoundDocument RichDocument that is being parsed.
   * @param justCrawl true if we should not collect Images and TextClippings, even when we could.
   *
   * @return a new RichDocumentParserCrawlerResult
   */
public RichDocumentParserCrawlerResult
constructRichDocumentParserResult(RichDocument compoundDocument, boolean justCrawl)
{
return new RichDocumentParserCrawlerResult(compoundDocument);
}
public void killSite(final SemanticsSite site)
{
ArrayList<DocumentClosure> removalSet = new ArrayList<DocumentClosure>();
int poolNum = 0;
    for (WeightSet<DocumentClosure> set : candidateDocumentClosuresPool.getWeightSets())
    {
      removalSet.clear();
      for (DocumentClosure documentClosure : set)
        if (documentClosure.isFromSite(site))
          removalSet.add(documentClosure);
      if (removalSet.size() > 0)
      {
        debug("Removing " + removalSet.size() + " candidate documentClosures from " + set);
        for (DocumentClosure toRemove : removalSet)
          set.remove(toRemove);
      }
      else
        debug("No DocumentClosures to remove from poolNum: " + poolNum + " :" + set);
      poolNum++; // advance the pool index for every WeightSet, not only the empty ones
    }
}
}
}