package ecologylab.bigsemantics.collecting;

import java.util.ArrayList;
import java.util.Observable;

import ecologylab.bigsemantics.documentparsers.RichDocumentParserCrawlerResult;
import ecologylab.bigsemantics.gui.InteractiveSpace;
import ecologylab.bigsemantics.metadata.builtins.RichDocument;
import ecologylab.bigsemantics.metadata.builtins.Document;
import ecologylab.bigsemantics.metadata.builtins.DocumentClosure;
import ecologylab.bigsemantics.metadata.builtins.Image;
import ecologylab.bigsemantics.metadata.builtins.TextClipping;
import ecologylab.bigsemantics.model.text.InterestModel;
import ecologylab.bigsemantics.model.text.TermVector;
import ecologylab.bigsemantics.model.text.TermVectorWeightStrategy;
import ecologylab.collections.GenericElement;
import ecologylab.collections.GenericPrioritizedPool;
import ecologylab.collections.GenericWeightSet;
import ecologylab.collections.PrioritizedPool;
import ecologylab.collections.WeightSet;

/**
 * Adds collecting of ImageClippings and TextClippings to the basic Crawler.
 *
 * @author andruid
 */
public class ImageTextCrawler extends Crawler
{
  private static final int STARVED_FOR_IMAGES_COUNT = 2;

  /**
   * When the {@link #candidateImagesPool candidateImagesPool} and the
   * {@link #candidateTextClippingElementsPool candidateTextClippingElementsPool} have more
   * entries than this, they will be pruned.
   */
  static final int MAX_MEDIA = 3072;

  public static final int NUM_GENERATIONS_IN_MEDIA_POOL = 3;

  static final int MAX_MEDIA_PER_GENERATION = MAX_MEDIA / NUM_GENERATIONS_IN_MEDIA_POOL;

  /**
   * Contains 3 visual pools. The first holds the first image of each container.
   */
  // FIXME This should be GenericWeightSet<ImageClipping> in order to use the right metadata in
  // TermVector!
  private final PrioritizedPool<DocumentClosure> candidateImagesPool;

  /**
   * Contains 3 FloatWeightSet pools. The first holds the first text surrogate of each container.
   */
  private final GenericPrioritizedPool<TextClipping> candidateTextClippingElementsPool;

  public ImageTextCrawler()
  {
    super();
    collectingImages = true;
    collectingText = true;

    TermVector piv = InterestModel.getPIV();

    // Three generations of pools for text surrogates, all weighted against the
    // participant interest vector.
    GenericWeightSet[] textWeightSets =
    {
        new GenericWeightSet<TextClipping>(MAX_MEDIA_PER_GENERATION, this, new TermVectorWeightStrategy(piv)),
        new GenericWeightSet<TextClipping>(MAX_MEDIA_PER_GENERATION, this, new TermVectorWeightStrategy(piv)),
        new GenericWeightSet<TextClipping>(MAX_MEDIA_PER_GENERATION, this, new TermVectorWeightStrategy(piv))
    };
    candidateTextClippingElementsPool = new GenericPrioritizedPool<TextClipping>(textWeightSets);

    // Three pools for downloaded images
    WeightSet<DocumentClosure>[] imageWeightSets = new WeightSet[NUM_GENERATIONS_IN_MEDIA_POOL];
    for (int i = 0; i < NUM_GENERATIONS_IN_MEDIA_POOL; i++)
      imageWeightSets[i] = new WeightSet<DocumentClosure>(MAX_MEDIA_PER_GENERATION, this, new TermVectorWeightStrategy(piv));
    candidateImagesPool = new PrioritizedPool<DocumentClosure>(imageWeightSets);
  }
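  /*
   * A note on the pool structure (a sketch of the behavior as used in this class, not a
   * specification of the ecologylab.collections API): a PrioritizedPool layers
   * NUM_GENERATIONS_IN_MEDIA_POOL WeightSets, each capped at MAX_MEDIA_PER_GENERATION
   * (3072 / 3 = 1024) elements. Each WeightSet ranks its members with TermVectorWeightStrategy
   * against the participant interest vector (piv), so when a generation fills up, pruning
   * discards the lowest-weighted candidates rather than the oldest. For example, with
   * hypothetical weights, a clipping scoring 0.9 against piv survives a pruning pass that
   * evicts one scoring 0.2, regardless of arrival order.
   */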
  @Override
  public void stopCollectingAgents(boolean kill)
  {
    super.stopCollectingAgents(kill);
    candidateImagesPool.stop();
  }

  /**
   * Replace images in the candidates with possibly better ones from their containers.
   */
  private void checkCandidatesParserResultsForBetterImagesAndText()
  {
    synchronized (candidateImagesPool)
    {
      for (DocumentClosure imageClosure : candidateImagesPool)
      {
        // TODO -- check among all source documents!!!
        Image image = (Image) imageClosure.getDocument();
        Document sourceDocument = image.getClippingSource();
        if (sourceDocument != null && sourceDocument.isRichDocument())
        {
          CompoundDocumentParserImageTextCrawlerResult crawlerResult =
              (CompoundDocumentParserImageTextCrawlerResult) sourceDocument.getParserResult();
          if (crawlerResult != null)
            crawlerResult.tryToGetBetterImageAfterInterestExpression(imageClosure);
        }
      }
    }
    synchronized (candidateTextClippingElementsPool)
    {
      for (GenericElement<TextClipping> textClippingElement : candidateTextClippingElementsPool)
      {
        TextClipping textClipping = textClippingElement.getGeneric();
        Document sourceDocument = textClipping.getSourceDoc();
        if (sourceDocument != null && sourceDocument.isRichDocument())
        {
          CompoundDocumentParserImageTextCrawlerResult crawlerResult =
              (CompoundDocumentParserImageTextCrawlerResult) sourceDocument.getParserResult();
          if (crawlerResult != null)
            crawlerResult.tryToGetBetterTextAfterInterestExpression(textClippingElement);
        }
      }
    }
  }

  /**
   * Remove from the candidate TextClippings pool.
   *
   * @param replaceMe TextClipping to remove.
   */
  @Override
  public void removeTextClippingFromPools(GenericElement<TextClipping> replaceMe)
  {
    candidateTextClippingElementsPool.remove(replaceMe);
  }

  /**
   * Remove from the candidate Images pool.
   *
   * @param replaceMe Image to remove.
   */
  @Override
  public void removeImageClippingFromPools(DocumentClosure replaceMe)
  {
    candidateImagesPool.remove(replaceMe);
  }

  /**
   * Number of displayable Images currently held as candidates.
   *
   * Used for deciding how urgent downloading Images is.
   */
  @Override
  public int imagePoolsSize()
  {
    return candidateImagesPool.size();
  }

  public static final int ALMOST_EMPTY_CANDIDATES_SET_THRESHOLD = 5;

  /**
   * Used to assess how much need we have for more TextClippings.
   */
  @Override
  public boolean candidateTextClippingsSetIsAlmostEmpty()
  {
    return candidateTextClippingElementsPool.size() <= ALMOST_EMPTY_CANDIDATES_SET_THRESHOLD;
  }

  /**
   * Collect a TextClipping if its interest-model weight, adjusted for how many surrogates its
   * compound document has already contributed, meets the pool's mean weight, or if the candidate
   * pool is almost empty.
   *
   * @param textClippingGE GenericElement that contains the TextClipping to potentially collect.
   * @param numSurrogatesCollectedFromCompoundDocument How many surrogates the source compound
   *          document has already contributed.
   * @param clippingPoolPriority Priority level in the candidates pool to insert into.
   *
   * @return true if the TextClipping was added to the candidates pool.
   */
  @Override
  public boolean collectTextClippingIfWorthwhile(GenericElement<TextClipping> textClippingGE,
                                                 int numSurrogatesCollectedFromCompoundDocument,
                                                 int clippingPoolPriority)
  {
    TextClipping textClipping = textClippingGE.getGeneric();
    float adjustedWeight = InterestModel.getInterestExpressedInTermVector(textClipping.termVector())
        / (float) numSurrogatesCollectedFromCompoundDocument;
    float meanTxtSetWeight = candidateTextClippingsMean();
    boolean result = (adjustedWeight >= meanTxtSetWeight) || candidateTextClippingsSetIsAlmostEmpty();
    if (result)
      addTextClippingToPool(textClippingGE, clippingPoolPriority);
    return result;
  }
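  /*
   * Worked example of the adjusted-weight heuristic above (hypothetical numbers): if a clipping's
   * TermVector scores 0.6 against the interest model and its compound document has already
   * contributed 3 surrogates, its adjusted weight is 0.6 / 3 = 0.2, so it is collected only if
   * 0.2 meets the pool's current mean weight, or if the pool holds no more than
   * ALMOST_EMPTY_CANDIDATES_SET_THRESHOLD entries. Dividing by the per-document surrogate count
   * keeps any single compound document from flooding the candidate pool.
   */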
  /**
   * Collect an image candidate if we are starved for images, or if its adjusted interest weight
   * meets the mean weight of the candidate images pool.
   */
  public boolean collectImageIfWorthwhile(DocumentClosure imageClosure,
                                          int numSurrogatesCollectedFromCompoundDocument,
                                          int clippingPoolPriority)
  {
    boolean result = false;
    if (imagePoolsSize() < STARVED_FOR_IMAGES_COUNT)
      result = true;
    if (!result)
    {
      float adjustedWeight = InterestModel.getInterestExpressedInTermVector(imageClosure.termVector())
          / (float) numSurrogatesCollectedFromCompoundDocument;
      float meanImagesWeight = candidateImagesMean();
      result = adjustedWeight >= meanImagesWeight;
    }
    if (result)
    {
      addCandidateImage(imageClosure);
      // FIXME How do we download images and dispatch them to the space?
    }
    return result;
  }

  public void addCandidateImage(DocumentClosure imageClosure)
  {
    candidateImagesPool.add(imageClosure);
  }

  /**
   * @return Weighted mean of the members of the candidate TextClippings pool.
   */
  public float candidateTextClippingsMean()
  {
    return candidateTextClippingElementsPool.mean();
  }

  public float candidateImagesMean()
  {
    return candidateImagesPool.mean();
  }

  /**
   * This is an Observer of changes in the TermVectors, which change when the interest model
   * changes.
   *
   * When the interest model changes, we iterate through candidate DocumentClosures to see if they
   * have a better link to contribute to our global crawler state. We make the same checks for
   * candidate Images and TextClippings.
   */
  @Override
  public void update(Observable o, Object arg)
  {
    super.update(o, arg);
    checkCandidatesParserResultsForBetterImagesAndText();
  }

  /**
   * Add a TextClipping into our pool of candidates.
   *
   * @param textClippingGE GenericElement that contains the TextClipping.
   * @param poolPriority Pool priority shapes which level in the candidates pool to insert into.
   */
  @Override
  public void addTextClippingToPool(GenericElement<TextClipping> textClippingGE, int poolPriority)
  {
    candidateTextClippingElementsPool.insert(textClippingGE, poolPriority);
    InteractiveSpace interactiveSpace = semanticsSessionScope.getInteractiveSpace();
    if (seeding != null && seeding.isPlayOnStart() && interactiveSpace != null)
      interactiveSpace.pressPlayWhenFirstMediaArrives();
  }

  /**
   * Pause the candidate Images collecting thread. (The base class implementation does nothing.)
   */
  @Override
  protected void pauseImageCollecting()
  {
    candidateImagesPool.pause();
  }

  /**
   * Unpause the candidate Images collecting thread. (The base class implementation does nothing.)
   */
  @Override
  protected void unpauseImageCollecting()
  {
    candidateImagesPool.unpause();
  }

  /**
   * Clear the candidateImagesPool and candidateTextClippingElementsPool, then call super() to
   * clear the candidateDocumentClosuresPool.
   */
  @Override
  public void clearCollections()
  {
    candidateImagesPool.clear();
    candidateTextClippingElementsPool.clear();
    super.clearCollections();
  }

  /**
   * Construct a CompoundDocument ParserResult object of a type that matches this crawler.
   *
   * @param compoundDocument Document that is parsed.
   * @param justCrawl True if we should not collect Images and TextClippings, even if we could.
   *
   * @return CompoundDocumentParserCrawlerResult
   */
  @Override
  public RichDocumentParserCrawlerResult constructRichDocumentParserResult(RichDocument compoundDocument,
                                                                           boolean justCrawl)
  {
    // When justCrawl is set, fall back to the base class result, which crawls without collecting
    // Images or TextClippings.
    return justCrawl
        ? super.constructRichDocumentParserResult(compoundDocument, justCrawl)
        : new CompoundDocumentParserImageTextCrawlerResult(compoundDocument);
  }
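  /*
   * Sketch of the interest-expression flow (assuming the Observer registration performed by the
   * Crawler base class): the interest model's TermVector is the Observable. When the user
   * expresses interest, update() fires; super.update() re-evaluates candidate DocumentClosures,
   * and checkCandidatesParserResultsForBetterImagesAndText() then asks each candidate's parser
   * result whether an image or TextClipping that was previously passed over now scores higher
   * under the new weights.
   */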
  @Override
  public void killSite(final SemanticsSite site)
  {
    super.killSite(site);

    // Remove candidate images that came from this site.
    ArrayList<DocumentClosure> removalSet = new ArrayList<DocumentClosure>();
    int poolNum = 0;
    for (WeightSet<DocumentClosure> set : candidateImagesPool.getWeightSets())
    {
      removalSet.clear();
      for (DocumentClosure documentClosure : set)
        if (documentClosure.isFromSite(site))
          removalSet.add(documentClosure);
      if (removalSet.size() > 0)
      {
        debug("Removing " + removalSet.size() + " candidate images from " + set);
        for (DocumentClosure toRemove : removalSet)
          set.remove(toRemove);
      }
      else
        debug("No Images to remove from poolNum: " + poolNum + " :" + set);
      poolNum++;
    }

    // Remove candidate text clippings that came from this site.
    ArrayList<GenericElement<TextClipping>> textRemovalSet = new ArrayList<GenericElement<TextClipping>>();
    poolNum = 0;
    for (WeightSet<GenericElement<TextClipping>> set : candidateTextClippingElementsPool.getWeightSets())
    {
      textRemovalSet.clear();
      for (GenericElement<TextClipping> genericElement : set)
      {
        TextClipping textClipping = genericElement.getGeneric();
        Document sourceDocument = textClipping.getSourceDoc();
        if (sourceDocument.getSite() == site)
          textRemovalSet.add(genericElement);
      }
      // Flush removals only after iterating, so we never remove from the set while iterating it.
      if (textRemovalSet.size() > 0)
      {
        debug("Removing " + textRemovalSet.size() + " candidate text clippings from " + set);
        for (GenericElement<TextClipping> toRemove : textRemovalSet)
          set.remove(toRemove);
      }
      else
        debug("No TextClippings to remove from poolNum: " + poolNum + " :" + set);
      poolNum++;
    }
  }
}
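/*
 * Hypothetical usage sketch (the surrounding wiring is an assumption; only the ImageTextCrawler
 * calls shown are from this class):
 *
 *   Crawler crawler = new ImageTextCrawler();  // collects images and text, unlike the base Crawler
 *   // ... seeding and the InteractiveSpace are configured by the enclosing semantics session ...
 *   crawler.killSite(site);                    // evicts that site's candidates from both pools
 *   crawler.clearCollections();                // empties image, text, and document candidate pools
 *
 * Interest expressions arrive through update(Observable, Object), which re-ranks all candidate
 * pools against the changed TermVector.
 */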