/**
 * 
 */
package ecologylab.bigsemantics.collecting;

import java.util.ArrayList;
import java.util.Observable;
import java.util.Observer;

import ecologylab.appframework.types.prefs.PrefBoolean;
import ecologylab.bigsemantics.documentparsers.RichDocumentParserCrawlerResult;
import ecologylab.bigsemantics.gui.InteractiveSpace;
import ecologylab.bigsemantics.metadata.builtins.RichDocument;
import ecologylab.bigsemantics.metadata.builtins.Document;
import ecologylab.bigsemantics.metadata.builtins.DocumentClosure;
import ecologylab.bigsemantics.metadata.builtins.TextClipping;
import ecologylab.bigsemantics.model.text.InterestModel;
import ecologylab.bigsemantics.model.text.TermVector;
import ecologylab.bigsemantics.model.text.TermVectorWeightStrategy;
import ecologylab.bigsemantics.seeding.SemanticsPrefs;
import ecologylab.collections.GenericElement;
import ecologylab.collections.PrioritizedPool;
import ecologylab.collections.WeightSet;
import ecologylab.generic.Debug;
import ecologylab.generic.Generic;
import ecologylab.generic.ThreadMaster;

/**
 * Crawler encapsulates all state regarding web crawling:
 * the WeightSets and PrioritizedPools of candidate, as yet unparsed, simple Document and
 * CompoundDocument objects.
 * 
 * @author andruid
 */
public class Crawler extends Debug implements Observer, ThreadMaster, Runnable, SemanticsPrefs
{
  protected SemanticsSessionScope semanticsSessionScope;

  Seeding seeding;

  /**
   * Priority to run at when we seem to have an overabundance of <code>MediaElement</code>s ready
   * for display.
   */
  protected static final int LO_PRIORITY = 3;

  /**
   * Priority to run at normally.
   */
  protected static final int MID_PRIORITY = 4;

  /**
   * Priority to run at when we do not have enough <code>MediaElement</code>s ready for display.
   */
  protected static final int HI_PRIORITY = 5;

  /**
   * Initial priority, which is {@link #MID_PRIORITY MID_PRIORITY}, because we expect to need to do
   * a bunch of crawling at the start, since our global collections will be empty.
   */
  protected static final int PRIORITY = MID_PRIORITY;

  /**
   * A constant for initializing our large HashTables.
   */
  static final float HASH_LOAD = .5f;

  /**
   * When the candidateContainersPool has more entries than this, it will be pruned.
   */
  // static final int MAX_PAGES = 2048;
  static final int MAX_PAGES = 4096; // 3072;

  public static final int NUM_GENERATIONS_IN_CONTAINER_POOL = 5;

  public static final int MAX_PAGES_PER_GENERATION = MAX_PAGES / NUM_GENERATIONS_IN_CONTAINER_POOL;

  private final PrioritizedPool<DocumentClosure> candidateDocumentClosuresPool;
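  // The pool above is organized as NUM_GENERATIONS_IN_CONTAINER_POOL WeightSets, each capped at
  // MAX_PAGES_PER_GENERATION closures. A rough sketch of how a closure is bucketed by link
  // generation (this mirrors the clamping done in addClosureToPool() below; Math.min is just a
  // compact restatement, and document / candidate name the same locals used there):
  //
  //   int generation = Math.min(document.getEffectiveGeneration(),
  //                             candidateDocumentClosuresPool.numWeightSets() - 1);
  //   candidateDocumentClosuresPool.insert(candidate, generation);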
  /**
   * true when the session is ending.
   */
  protected boolean finished;

  /**
   * Controls whether or not the crawler is (temporarily) paused.
   */
  protected boolean running = true;

  boolean crashed;

  /**
   * Web crawler sleep time when we are in need of collecting more media.
   */
  protected final int usualSleep;

  /**
   * Web crawler sleep time when there seems to be plenty of media already collected.
   */
  protected final int longSleep;

  /**
   * Controls whether or not we periodically, automatically download the DocumentClosures
   * associated with outlinks that have been discovered. In other words, controls whether or not we
   * crawl at all. Set as a preference at runtime, and via a menu entry.
   */
  protected PrefBoolean downloadLinksAutomatically = CRAWL;

  // +++++++++++ state for thread +++++++++++ //

  /**
   * The web crawler thread.
   */
  protected Thread thread;

  /**
   * The number of loops through the web crawler. A performance statistic that roughly corresponds
   * to how many new <code>Container</code>s have been queued for download and parse during this
   * session.
   */
  protected int count;

  protected boolean collectingImages;

  public boolean isCollectingImages()
  {
    return collectingImages;
  }

  public boolean isCollectingText()
  {
    return collectingText;
  }

  protected boolean collectingText;

  /**
   * Construct a Crawler: register as an Observer of the interest model's TermVector, and build the
   * PrioritizedPool of candidate DocumentClosures, one WeightSet per link generation.
   */
  public Crawler()
  {
    TermVector piv = InterestModel.getPIV();
    piv.addObserver(this);

    WeightSet<DocumentClosure>[] documentClosureWeightSets =
        new WeightSet[NUM_GENERATIONS_IN_CONTAINER_POOL];
    for (int i = 0; i < NUM_GENERATIONS_IN_CONTAINER_POOL; i++)
      documentClosureWeightSets[i] =
          new WeightSet<DocumentClosure>(MAX_PAGES_PER_GENERATION, this,
              (TermVectorWeightStrategy) new DownloadContainerWeightingStrategy(piv));
    candidateDocumentClosuresPool = new PrioritizedPool<DocumentClosure>(documentClosureWeightSets);

    finished = false;
    usualSleep = 3000;
    longSleep = usualSleep * 5 / 2;
  }

  /**
   * Stop the threads that are responsible for collecting new surrogates.
   * Includes the candidateImageVisualsPool, crawler, and seeding download monitors.
   * 
   * @param kill
   */
  public void stopCollectingAgents(boolean kill)
  {
    semanticsSessionScope.getDownloadMonitors().stop(kill);
  }

  final Object startCrawlerSemaphore = new Object();

  // FIXME
  public synchronized void start()
  {
    if (downloadLinksAutomatically.value())
    {
      if (!crashed && (thread == null) && !seeding.isDuringSeeding())
      {
        debug("Starting up.");
        // Thread.dumpStack();
        finished = false;
        thread = new Thread(this, "InfoCollector");
        // ThreadDebugger.registerMyself(thread);
        Generic.setPriority(PRIORITY);
        thread.start();
      }
      else
      {
        unpause();
      }
    }
  }

  public synchronized void pause()
  {
    if (thread != null)
    {
      debug("pause()");
      running = false;
    }
  }

  public synchronized void unpause()
  {
    // if ((thread != null) && !running && downloadLinksAutomatically)
    if ((thread != null) && downloadLinksAutomatically.value() && !seeding.isDuringSeeding())
    {
      debug("unpause()");
      running = true;
      notifyAll();
    }
  }
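  // A minimal usage sketch of the crawler lifecycle (assumes a Seeding instance is available and
  // that semanticsSessionScope is injected by the enclosing framework; the variable names are
  // illustrative only):
  //
  //   Crawler crawler = new Crawler();
  //   crawler.setSeeding(seeding);   // wire in the Seeding collaborator before starting
  //   crawler.start();               // spins up the "InfoCollector" thread if the CRAWL pref is set
  //   ...
  //   crawler.pause();               // temporarily stop crawling
  //   crawler.unpause();             // resume, unless seeding is in progress
  //   crawler.stop(false);           // stop download monitors and clear the candidate pools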
  /**
   * When the interest model changes, ask each candidate's ancestor document whether it now has a
   * better outlink to offer, and swap that into the candidate pool in place of the current closure.
   */
  private void checkCandidatesParserResultsForBetterOutlinks()
  {
    synchronized (candidateDocumentClosuresPool)
    {
      WeightSet<DocumentClosure>[] candidateSet =
          (WeightSet<DocumentClosure>[]) candidateDocumentClosuresPool.getWeightSets();
      int maxSize = candidateDocumentClosuresPool.maxSize();
      ArrayList<DocumentClosure> removeContainers = new ArrayList<DocumentClosure>(maxSize);
      ArrayList<DocumentClosure> insertContainers = new ArrayList<DocumentClosure>(maxSize);
      for (WeightSet<DocumentClosure> candidates : candidateSet)
      {
        for (DocumentClosure c : candidates)
        {
          Document ancestorDocument = c.getDocument().getAncestor();
          if (ancestorDocument != null
              && !(ancestorDocument.isRecycled() /* || ancestor.isRecycling() */))
          {
            RichDocumentParserCrawlerResult crawlerResult =
                (RichDocumentParserCrawlerResult) ancestorDocument.getParserResult();
            if (crawlerResult != null)
            {
              DocumentClosure d = crawlerResult.swapNextBestOutlinkWith(c);
              if (d != null && c != d)
              {
                removeContainers.add(c);
                insertContainers.add(d);
              }
            }
          }
        }
        int swapSize = insertContainers.size();
        if (swapSize > 0)
          System.out.println("Swapping Containers:\n\tReplacing " + swapSize);
        for (DocumentClosure c : removeContainers)
          candidates.remove(c);
        removeContainers.clear();
        // Insertion directly into the weight set is OK,
        // because the replacement container will have the same generation.
        for (DocumentClosure c : insertContainers)
          candidates.insert(c);
        insertContainers.clear();
      }
    }
  }

  /**
   * Blank implementation in base class.
   * 
   * @param replaceMe TextClipping to remove
   */
  public void removeTextClippingFromPools(GenericElement<TextClipping> replaceMe)
  {
  }

  /**
   * Blank implementation in base class.
   * 
   * @param replaceMe DocumentClosure of the image to remove
   */
  public void removeImageClippingFromPools(DocumentClosure replaceMe)
  {
  }

  /**
   * Always return 0 here in the base class.
   */
  public int imagePoolsSize()
  {
    return 0;
  }

  /**
   * Used to assess how much need we have for more TextClippings.
   * 
   * @return false in base class implementation.
   */
  public boolean candidateTextClippingsSetIsAlmostEmpty()
  {
    return false;
  }

  /**
   * Collect a TextClipping, based on its weight and on whether it is the first representative of
   * its CompoundDocument.
   * 
   * @param textClippingGE TextClipping to potentially collect
   * @param numSurrogatesCollectedFromCompoundDocument
   * @param clippingPoolPriority
   * 
   * @return always false in this base class implementation, because we do not collect TextClippings.
   */
  public boolean collectTextClippingIfWorthwhile(GenericElement<TextClipping> textClippingGE,
      int numSurrogatesCollectedFromCompoundDocument, int clippingPoolPriority)
  {
    return false;
  }

  /**
   * This is an Observer of changes in the TermVectors, which change when the interest model changes.
   * 
   * When the interest model changes, we iterate through candidate DocumentClosures to see if they
   * have a better link to contribute to our global crawler state.
   */
  @Override
  public void update(Observable o, Object arg)
  {
    checkCandidatesParserResultsForBetterOutlinks();
  }

  public void increaseNumImageReferences()
  {
    // TODO Auto-generated method stub
  }

  public void decreaseNumImageReferences()
  {
    // TODO Auto-generated method stub
  }

  /**
   * Base class implementation does nothing.
   * 
   * @param textClippingGE
   * @param poolPriority
   */
  public void addTextClippingToPool(GenericElement<TextClipping> textClippingGE, int poolPriority)
  {
  }

  public boolean addDocumentToPool(Document document)
  {
    return addClosureToPool(document.getOrConstructClosure());
  }

  public boolean addClosureToPool(DocumentClosure candidate)
  {
    if (candidate != null)
    {
      synchronized (candidate)
      {
        if (candidate.isUnprocessed() && !candidate.isSeed())
        {
          Document document = candidate.getDocument();
          if (!exceedsLinkCountThresholds(document))
          {
            int generation = document.getEffectiveGeneration();
            int maxPoolNum = candidateDocumentClosuresPool.numWeightSets() - 1;
            if (generation > maxPoolNum)
              generation = maxPoolNum;
            debugT("---\t---\t---\tAdding Container to candidateContainersPool: " + candidate
                /* + " ancestor=[" + candidate.ancestor() + "]" */);
            candidateDocumentClosuresPool.insert(candidate, generation);
            return true;
          }
        }
      }
    }
    return false;
  }

  // public void removeCandidateContainer(DocumentClosure candidate)
  // {
  //   if (candidate != null && !candidate.downloadHasBeenQueued())
  //   {
  //     candidateContainersPool.remove(candidate);
  //   }
  // }
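  // A sketch of how a caller (for example, a parser that has just extracted outlinks) might feed
  // documents to this crawler. Here crawler denotes an instance of this class and
  // discoveredOutlinks is a hypothetical collection of parsed outlink Documents:
  //
  //   for (Document outlink : discoveredOutlinks)
  //     crawler.addDocumentToPool(outlink);   // filtered by exceedsLinkCountThresholds(),
  //                                           // then bucketed by link generation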
  /**
   * Determines whether a given <code>Document</code> exceeds the <code>linkCount</code> thresholds.
   * Crawling too many links without seeing that the user is interested tends to lead to noisy content.
   * <p/>
   * Current thresholds are as follows:
   * <ul>
   * <li><code>linkCount</code> of 2 if the link is to a new domain</li>
   * <li><code>linkCount</code> of 4 if the link is to the same domain</li>
   * </ul>
   * These thresholds are overridden if the user has expressed interest in surrogates from this
   * particular <code>Container</code>.
   * 
   * @param document the <code>Container</code> to evaluate for thresholds
   * @return <code>true</code> if the element's <code>linkCount</code> exceeds the thresholds
   */
  public boolean exceedsLinkCountThresholds(Document document)
  {
    // debug("---------exceedsLinkCountThresholds---------");
    int linkCount = document.getGeneration();
    boolean sameDomainAsPrevious = document.isSameDomainAsPrevious();
    short participantInterestIntensity =
        InterestModel.getInterestExpressedInTermVector(document.termVector());

    if (linkCount <= 2)
    {
      // debug("---ACCEPT1: intensity: " + participantInterestIntensity + " link count: " + linkCount + " same domain: " + sameDomainAsPrevious);
      return false;
    }
    else if (linkCount <= 4 && sameDomainAsPrevious)
    {
      // debug("---ACCEPT2: intensity: " + participantInterestIntensity + " link count: " + linkCount + " same domain: " + sameDomainAsPrevious);
      return false;
    }
    else if (participantInterestIntensity > 0) // TODO: make sure participant interest is being kept up
    {
      // debug("---ACCEPT3: intensity: " + participantInterestIntensity + " link count: " + linkCount + " same domain: " + sameDomainAsPrevious);
      return false;
    }
    else
    {
      // debug("--- exceedsLinkCountThresholds() REJECT : intensity: " + participantInterestIntensity + " link count: " + linkCount + " same domain: " + sameDomainAsPrevious);
      // System.out.println("Ancestors Interest = " + element.ancestor().participantInterest().intensity());
      return true;
    }
  }

  /**
   * Message to the user when the crawler stops because there's no place left to crawl to.
   */
  static final String TRAVERSAL_COMPLETE_MSG =
      "The web crawler is pausing:\ntraversal of the information space is complete.\nThere are no more traversable pages to crawl to.";

  boolean threadsArePaused;

  boolean collectingThreadsPaused;

  /**
   * Pause all the threads we know about.
   * 
   * @return true if threads exist and were not already paused, so that the caller can later
   *         restore the previous paused state.
   */
  @Override
  public boolean pauseThreads()
  {
    boolean needToPause = (thread != null) && !threadsArePaused;
    if (needToPause)
    {
      threadsArePaused = true;
      // debug("Container.pauseThreads()");
      InteractiveSpace interactiveSpace = semanticsSessionScope.getInteractiveSpace();
      if (interactiveSpace != null)
        interactiveSpace.pauseIfPlaying();
      pauseThreadsExceptCompositionAgent();
      if (interactiveSpace != null)
        interactiveSpace.waitIfPlaying();
    }
    return needToPause;
  }

  boolean threadsExceptCompositionArePause;

  /**
   * Pause all the threads we know about, except the composition agent.
   */
  public boolean pauseThreadsExceptCompositionAgent()
  {
    boolean needToPause = (thread != null) && !threadsExceptCompositionArePause;
    if (needToPause)
    {
      pause();
      InteractiveSpace interactiveSpace = semanticsSessionScope.getInteractiveSpace();
      if (interactiveSpace != null)
        interactiveSpace.pausePipeline();
      pauseImageCollecting();
      semanticsSessionScope.getDownloadMonitors().pauseRegularDownloadMonitors();
      threadsExceptCompositionArePause = true;
    }
    return needToPause;
  }

  public void pauseCollectingThreads()
  {
    if ((thread != null) && !collectingThreadsPaused)
    {
      collectingThreadsPaused = true;
      pauseImageCollecting();
      pause();
    }
  }

  public void unpauseCollectingThreads()
  {
    if (collectingThreadsPaused)
    {
      collectingThreadsPaused = false;
      unpauseImageCollecting();
      unpause();
    }
  }
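  // A sketch of the pause/restore idiom that the return value of pauseThreads() supports. Here
  // crawler denotes an instance of this class, and the work inside the try block is hypothetical:
  //
  //   boolean pausedHere = crawler.pauseThreads();   // true only if this call actually paused anything
  //   try
  //   {
  //     // ... do work that requires a quiescent crawler ...
  //   }
  //   finally
  //   {
  //     if (pausedHere)
  //       crawler.unpauseThreads();                  // restore only what we paused
  //   }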
  /**
   * Pause the candidate Images collecting thread.
   * 
   * Base class implementation does nothing.
   */
  protected void pauseImageCollecting()
  {
  }

  /**
   * Unpause the candidate Images collecting thread.
   * 
   * Base class implementation does nothing.
   */
  protected void unpauseImageCollecting()
  {
  }

  public String getThreadStatuses()
  {
    return "ThreadsPaused = AllThreads(" + threadsArePaused + ") "
        + "Collect(" + collectingThreadsPaused + ") "
        + "NonComposition(" + threadsExceptCompositionArePause + ")";
  }

  /**
   * Unpause (continue) all the threads we know about.
   */
  @Override
  public void unpauseThreads()
  {
    if (threadsArePaused)
    {
      threadsArePaused = false;
      // debug("Container.unpauseThreads() crawlerDownloadMonitor waitingToDownload=" +
      //       crawlerDownloadMonitor.waitingToDownload());
      InteractiveSpace interactiveSpace = semanticsSessionScope.getInteractiveSpace();
      if (interactiveSpace != null)
      {
        interactiveSpace.restorePlayIfWasPlaying();
        interactiveSpace.unpausePipeline();
      }
      unpauseNonCompositionThreads();
    }
    else if (threadsExceptCompositionArePause)
    {
      unpauseNonCompositionThreads();
    }
  }

  private void unpauseNonCompositionThreads()
  {
    threadsExceptCompositionArePause = false;
    unpauseImageCollecting();
    semanticsSessionScope.getDownloadMonitors().unpauseRegularDownloadMonitors();
    unpause();
  }

  public void stop()
  {
    stop(false);
  }

  public void stop(boolean kill)
  {
    if (!finished)
      finished = true;
    stopCollectingAgents(kill);
    semanticsSessionScope.getDownloadMonitors().stop(kill);
    // clear all the collections when the CF browser exits -- eunyee
    // ThreadDebugger.clear();
    clearCollections();
  }

  /**
   * Clear the candidateDocumentClosuresPool.
   */
  public void clearCollections()
  {
    candidateDocumentClosuresPool.clear();
  }

  public boolean isOn()
  {
    return running && downloadLinksAutomatically.value();
  }

  /**
   * Crawler run loop.
   */
  @Override
  public void run()
  {
    // TODO Auto-generated method stub
  }

  // ------------------- Thread related state handling ------------------- //

  protected synchronized void waitIfNotRunning()
  {
    if (!running || !downloadLinksAutomatically.value())
    {
      try
      {
        debug("waitIfOff() waiting");
        wait();
        running = true;
      }
      catch (InterruptedException e)
      {
        debug("run(): wait interrupted: ");
        e.printStackTrace();
        Thread.interrupted(); // clear the interrupt
      }
    }
  }

  /**
   * @return the seeding
   */
  public Seeding getSeeding()
  {
    return seeding;
  }

  /**
   * @param seeding the seeding to set
   */
  public void setSeeding(Seeding seeding)
  {
    this.seeding = seeding;
  }

  /**
   * Construct a CompoundDocument ParserResult object of a type that matches this crawler.
   * 
   * @param compoundDocument Document that is parsed.
   * @param justCrawl True if we should not collect Images and TextClippings, even if we could.
   * 
   * @return RichDocumentParserCrawlerResult
   */
  public RichDocumentParserCrawlerResult constructRichDocumentParserResult(RichDocument compoundDocument,
      boolean justCrawl)
  {
    return new RichDocumentParserCrawlerResult(compoundDocument);
  }

  public void killSite(final SemanticsSite site)
  {
    ArrayList<DocumentClosure> removalSet = new ArrayList<DocumentClosure>();
    int poolNum = 0;
    for (WeightSet<DocumentClosure> set : candidateDocumentClosuresPool.getWeightSets())
    {
      removalSet.clear();
      for (DocumentClosure documentClosure : set)
        if (documentClosure.isFromSite(site))
          removalSet.add(documentClosure);
      if (removalSet.size() > 0)
      {
        debug("Removing " + removalSet.size() + " candidate documentClosures from " + set);
        for (DocumentClosure toRemove : removalSet)
          set.remove(toRemove);
      }
      else
        debug("No DocumentClosures to remove from poolNum: " + poolNum + " :" + set);
      poolNum++;
    }
  }
}