/**
*
*/
package ecologylab.bigsemantics.collecting;
import java.util.ArrayList;
import java.util.Observable;
import ecologylab.bigsemantics.documentparsers.RichDocumentParserCrawlerResult;
import ecologylab.bigsemantics.gui.InteractiveSpace;
import ecologylab.bigsemantics.metadata.builtins.RichDocument;
import ecologylab.bigsemantics.metadata.builtins.Document;
import ecologylab.bigsemantics.metadata.builtins.DocumentClosure;
import ecologylab.bigsemantics.metadata.builtins.Image;
import ecologylab.bigsemantics.metadata.builtins.TextClipping;
import ecologylab.bigsemantics.model.text.InterestModel;
import ecologylab.bigsemantics.model.text.TermVector;
import ecologylab.bigsemantics.model.text.TermVectorWeightStrategy;
import ecologylab.collections.GenericElement;
import ecologylab.collections.GenericPrioritizedPool;
import ecologylab.collections.GenericWeightSet;
import ecologylab.collections.PrioritizedPool;
import ecologylab.collections.WeightSet;
/**
* Adds collecting of ImageClippings and TextClippings to basic Crawler.
*
* @author andruid
*/
public class ImageTextCrawler extends Crawler
{
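/**
* When fewer than this many candidate images are pooled, new images are collected
* unconditionally, without comparing weights (see collectImageIfWorthwhile).
*/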
private static final int STARVED_FOR_IMAGES_COUNT = 2;
/**
* When the {@link #candidateTextClippingElementsPool candidateTextClippingElementsPool} and the
* {@link #candidateImagesPool candidateImagesPool} have more entries than this, they will be pruned.
*/
static final int MAX_MEDIA = 3072;
public static final int NUM_GENERATIONS_IN_MEDIA_POOL = 3;
static final int MAX_MEDIA_PER_GENERATION = MAX_MEDIA / NUM_GENERATIONS_IN_MEDIA_POOL;
/**
* Contains {@link #NUM_GENERATIONS_IN_MEDIA_POOL} generational pools of candidate image closures. The first holds the first image of each container.
*/
//FIXME This should be GenericWeightSet<ImageClipping> in order to use the right metadata in TermVector !!!!!!!!!
private final PrioritizedPool<DocumentClosure> candidateImagesPool;
/**
* Contains {@link #NUM_GENERATIONS_IN_MEDIA_POOL} GenericWeightSet pools of candidate TextClippings.
* The first holds the first text surrogate of each container.
*/
private final GenericPrioritizedPool<TextClipping> candidateTextClippingElementsPool;
/**
* Construct an ImageTextCrawler: enable image and text collecting, and allocate
* generational candidate pools weighted by the interest model.
*/
public ImageTextCrawler()
{
super();
collectingImages = true;
collectingText = true;
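// All candidate pools are weighted against the interest model's term vector (PIV),
// so candidate ordering tracks expressed user interest.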
TermVector piv = InterestModel.getPIV();
// Generational weight-set pools for candidate text surrogates (TextClippings)
GenericWeightSet[] textWeightSets = {
new GenericWeightSet<TextClipping>(MAX_MEDIA_PER_GENERATION, this, new TermVectorWeightStrategy(piv)),
new GenericWeightSet<TextClipping>(MAX_MEDIA_PER_GENERATION, this, new TermVectorWeightStrategy(piv)),
new GenericWeightSet<TextClipping>(MAX_MEDIA_PER_GENERATION, this, new TermVectorWeightStrategy(piv))
};
candidateTextClippingElementsPool = new GenericPrioritizedPool<TextClipping>(textWeightSets);
// Three pools for downloaded images
WeightSet<DocumentClosure>[] imageWeightSets = new WeightSet[NUM_GENERATIONS_IN_MEDIA_POOL];
for (int i = 0; i < NUM_GENERATIONS_IN_MEDIA_POOL; i++)
imageWeightSets[i] = new WeightSet<DocumentClosure>(MAX_MEDIA_PER_GENERATION, this, new TermVectorWeightStrategy(piv));
candidateImagesPool = new PrioritizedPool<DocumentClosure>(imageWeightSets);
}
@Override
public void stopCollectingAgents(boolean kill)
{
super.stopCollectingAgents(kill);
candidateImagesPool.stop();
}
/**
* Check each candidate image and text clipping against its source document's parser result,
* replacing it with a better one when the changed interest model warrants.
*/
private void checkCandidatesParserResultsForBetterImagesAndText()
{
synchronized (candidateImagesPool)
{
for (DocumentClosure imageClosure : candidateImagesPool)
{
//TODO -- check among all source documents!!!
Image image = (Image) imageClosure.getDocument();
Document sourceDocument = image.getClippingSource();
if (sourceDocument != null && sourceDocument.isRichDocument())
{
CompoundDocumentParserImageTextCrawlerResult crawlerResult = (CompoundDocumentParserImageTextCrawlerResult) sourceDocument.getParserResult();
if (crawlerResult != null)
{
crawlerResult.tryToGetBetterImageAfterInterestExpression(imageClosure);
}
}
}
}
synchronized (candidateTextClippingElementsPool)
{
for (GenericElement<TextClipping> textClippingElement : candidateTextClippingElementsPool)
{
TextClipping textClipping = textClippingElement.getGeneric();
Document sourceDocument = textClipping.getSourceDoc();
if (sourceDocument != null && sourceDocument.isRichDocument())
{
CompoundDocumentParserImageTextCrawlerResult crawlerResult = (CompoundDocumentParserImageTextCrawlerResult) sourceDocument.getParserResult();
if (crawlerResult != null)
{
crawlerResult.tryToGetBetterTextAfterInterestExpression(textClippingElement);
}
}
}
}
}
/**
* Remove from candidate clippings pool.
*
* @param replaceMe GenericElement wrapping the TextClipping to remove
*/
@Override
public void removeTextClippingFromPools(GenericElement<TextClipping> replaceMe)
{
candidateTextClippingElementsPool.remove(replaceMe);
}
/**
* Remove from candidate Images pool.
*
* @param replaceMe DocumentClosure of the Image to remove
*/
@Override
public void removeImageClippingFromPools(DocumentClosure replaceMe)
{
candidateImagesPool.remove(replaceMe);
}
/**
* Number of candidate <code>Image</code>s available for display.
*
* Used for deciding how urgent downloading Images is.
*/
@Override
public int imagePoolsSize()
{
return candidateImagesPool.size();
}
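/**
* Threshold at or below which the candidate TextClippings pool is considered almost empty.
*/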
public static final int ALMOST_EMPTY_CANDIDATES_SET_THRESHOLD = 5;
/**
* Used to assess how much need we have for more TextClippings.
*/
@Override
public boolean candidateTextClippingsSetIsAlmostEmpty()
{
return candidateTextClippingElementsPool.size() <= ALMOST_EMPTY_CANDIDATES_SET_THRESHOLD;
}
/**
* Collect a TextClipping if its interest-adjusted weight is at least the mean weight of the
* current candidates, or if the candidates pool is almost empty.
*
* @param textClippingGE GenericElement wrapping the TextClipping to potentially collect.
* @param numSurrogatesCollectedFromCompoundDocument Number of surrogates already collected from the source CompoundDocument; used to discount the weight.
* @param clippingPoolPriority Priority level in the candidates pool to insert into.
*
* @return true if the TextClipping was added to the candidates pool.
*/
@Override
public boolean collectTextClippingIfWorthwhile(GenericElement<TextClipping> textClippingGE, int numSurrogatesCollectedFromCompoundDocument, int clippingPoolPriority)
{
TextClipping textClipping = textClippingGE.getGeneric();
float adjustedWeight = InterestModel.getInterestExpressedInTermVector(textClipping.termVector()) / (float) numSurrogatesCollectedFromCompoundDocument;
float meanTxtSetWeight = candidateTextClippingsMean();
boolean result = (adjustedWeight >= meanTxtSetWeight) || candidateTextClippingsSetIsAlmostEmpty();
if (result)
{
addTextClippingToPool(textClippingGE, clippingPoolPriority);
}
return result;
}
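/**
* Collect an image if we are starved for candidate images, or if its interest-adjusted weight
* is at least the mean weight of the current candidate images.
*
* @param imageClosure DocumentClosure of the Image to potentially collect.
* @param numSurrogatesCollectedFromCompoundDocument Number of surrogates already collected from the source document; used to discount the weight.
* @param clippingPoolPriority Priority level for the candidates pool; not used by this implementation.
*
* @return true if the image was added to the candidate images pool.
*/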
public boolean collectImageIfWorthwhile(DocumentClosure imageClosure, int numSurrogatesCollectedFromCompoundDocument, int clippingPoolPriority)
{
boolean result = false;
if (imagePoolsSize() < STARVED_FOR_IMAGES_COUNT)
result = true;
if (!result)
{
float adjustedWeight = InterestModel.getInterestExpressedInTermVector(imageClosure.termVector()) / (float) numSurrogatesCollectedFromCompoundDocument;
float meanImagesWeight = candidateImagesMean();
result = adjustedWeight >= meanImagesWeight;
}
if (result)
{
addCandidateImage(imageClosure);
//FIXME How do we download images and dispatch them to the space!!!????
}
return result;
}
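/**
* Add a DocumentClosure for an Image to the candidate images pool.
*
* @param imageClosure DocumentClosure of the candidate Image.
*/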
public void addCandidateImage(DocumentClosure imageClosure)
{
candidateImagesPool.add(imageClosure);
}
/**
* @return Weighted mean of the members of the candidateTextClippingElementsPool, across all of its generations.
*/
public float candidateTextClippingsMean()
{
return candidateTextClippingElementsPool.mean();
}
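/**
* @return Weighted mean of the members of the candidateImagesPool, across all of its generations.
*/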
public float candidateImagesMean()
{
return candidateImagesPool.mean();
}
/**
* This is an Observer of changes in the TermVectors, which change when the interest model changes.
*
* When the interest model changes, we iterate through candidate DocumentClosures to see if they have a better link
* to contribute to our global crawler state.
* We make the same checks for candidate Images and TextClippings.
*/
@Override
public void update(Observable o, Object arg)
{
super.update(o, arg);
checkCandidatesParserResultsForBetterImagesAndText();
}
/**
* Add a TextClipping into our pool of candidates.
*
* @param textClippingGE GenericElement that contains the TextClipping.
*
* @param poolPriority Determines which generation of the candidates pool to insert into.
*/
@Override
public void addTextClippingToPool(GenericElement<TextClipping> textClippingGE, int poolPriority)
{
candidateTextClippingElementsPool.insert(textClippingGE, poolPriority);
InteractiveSpace interactiveSpace = semanticsSessionScope.getInteractiveSpace();
if (seeding != null && seeding.isPlayOnStart() && interactiveSpace != null)
interactiveSpace.pressPlayWhenFirstMediaArrives();
}
/**
* Pause the candidate Images pool's collecting thread.
*/
@Override
protected void pauseImageCollecting()
{
candidateImagesPool.pause();
}
/**
* Unpause the candidate Images pool's collecting thread.
*/
@Override
protected void unpauseImageCollecting()
{
candidateImagesPool.unpause();
}
/**
* Clear the candidateImagesPool and candidateTextClippingElementsPool, then call
* super.clearCollections() to clear the candidateDocumentClosuresPool.
*/
@Override
public void clearCollections()
{
candidateImagesPool.clear();
candidateTextClippingElementsPool.clear();
super.clearCollections();
}
/**
* Construct a CompoundDocument ParserResult object of the type that matches this crawler.
*
* @param compoundDocument Document that is parsed.
* @param justCrawl True if we should not collect Images and TextClippings, even if we could.
*
* @return RichDocumentParserCrawlerResult of the type appropriate for this crawler.
*/
@Override
public RichDocumentParserCrawlerResult
constructRichDocumentParserResult(RichDocument compoundDocument, boolean justCrawl)
{
return justCrawl ? super.constructRichDocumentParserResult(compoundDocument, justCrawl)
: new CompoundDocumentParserImageTextCrawlerResult(compoundDocument);
}
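/**
* Stop crawling a site: after delegating to the base class, remove all candidate images and
* TextClippings whose source is the killed site.
*
* @param site SemanticsSite being killed.
*/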
@Override
public void killSite(final SemanticsSite site)
{
super.killSite(site);
ArrayList<DocumentClosure> removalSet = new ArrayList<DocumentClosure>();
int poolNum = 0;
for(WeightSet<DocumentClosure> set : candidateImagesPool.getWeightSets())
{
removalSet.clear();
for(DocumentClosure documentClosure : set)
if(documentClosure.isFromSite(site))
removalSet.add(documentClosure);
if(removalSet.size() > 0)
{
debug("Removing " + removalSet.size() + " candidate images from " + set);
for(DocumentClosure toRemove : removalSet)
set.remove(toRemove);
}
else
debug("No Images to remove from poolNum: " + poolNum++ + " :" + set);
}
ArrayList<GenericElement<TextClipping>> textRemovalSet = new ArrayList<GenericElement<TextClipping>>();
// remove relevant text
for(WeightSet<GenericElement<TextClipping>> set : candidateTextClippingElementsPool.getWeightSets())
{
textRemovalSet.clear();
for (GenericElement<TextClipping> genericElement : set)
{
TextClipping textClipping = genericElement.getGeneric();
Document sourceDocument = textClipping.getSourceDoc();
if (sourceDocument != null && sourceDocument.getSite() == site)
textRemovalSet.add(genericElement);
}
if (textRemovalSet.size() > 0)
{
debug("Removing " + textRemovalSet.size() + " candidate text clippings from " + set);
for (GenericElement<TextClipping> toRemove : textRemovalSet)
set.remove(toRemove);
}
else
debug("No TextClippings to remove from poolNum: " + poolNum++ + " :" + set);
}
}
}