package ecologylab.bigsemantics.collecting;

import ecologylab.bigsemantics.documentparsers.RichDocumentParserCrawlerResult;
import ecologylab.bigsemantics.metadata.builtins.Clipping;
import ecologylab.bigsemantics.metadata.builtins.RichDocument;
import ecologylab.bigsemantics.metadata.builtins.DocumentClosure;
import ecologylab.bigsemantics.metadata.builtins.Image;
import ecologylab.bigsemantics.metadata.builtins.ImageClipping;
import ecologylab.bigsemantics.metadata.builtins.TextClipping;
import ecologylab.bigsemantics.model.text.InterestModel;
import ecologylab.bigsemantics.model.text.TermVectorWeightStrategy;
import ecologylab.collections.GenericElement;
import ecologylab.collections.GenericWeightSet;
import ecologylab.collections.WeightSet;
import ecologylab.generic.Continuation;

/**
 * The per-CompoundDocument component of the ImageTextCrawler.
 *
 * @author andruid
 */
public class CompoundDocumentParserImageTextCrawlerResult
    extends RichDocumentParserCrawlerResult<ImageTextCrawler>
    implements Continuation<DocumentClosure>
{
  /**
   * Weighted collection of DocumentClosures for candidate images.
   * Contains elements that have not yet been transported to the candidate pool.
   */
  //FIXME This should be GenericWeightSet<ImageClipping> in order to use the right metadata in TermVector!
  private WeightSet<DocumentClosure>      candidateImageClosures;

  /**
   * Weighted collection of candidate <code>TextClipping</code>s.
   * Contains elements that have not yet been transported to the candidate pool.
   */
  private GenericWeightSet<TextClipping>  candidateTextClippings;

  private boolean                         crawlingImages;

  private boolean                         crawlingTextClippings;

  double                                  mostRecentImageWeight = 0,
                                          mostRecentTextWeight  = 0;

  /**
   * @param compoundDocument
   */
  public CompoundDocumentParserImageTextCrawlerResult(RichDocument compoundDocument)
  {
    super(compoundDocument);
  }

  int sizeCandidateTextClippings()
  {
    return candidateTextClippings == null ? 0 : candidateTextClippings.size();
  }

  int sizeCandidateImageClosures()
  {
    return candidateImageClosures == null ? 0 : candidateImageClosures.size();
  }

  public int sizeLocalCandidates()
  {
    return sizeCandidateTextClippings() + sizeCandidateImageClosures();
  }

  /**
   * Select the highest weighted candidate TextClipping and offer it to the crawler's pool:
   * unconditionally for the first clipping, otherwise only if the crawler deems it worthwhile.
   */
  protected synchronized void perhapsAddTextClippingToCrawler()
  {
    GenericElement<TextClipping> textClippingGE = null;
    if (candidateTextClippings != null)
    {
      textClippingGE = candidateTextClippings.maxSelect();
    }
    if (textClippingGE != null)
    {
      // If no surrogate has been delivered to the candidate pool from the container,
      // send it to the candidate pool without checking the media weight.
      if (firstClipping())
        crawler.addTextClippingToPool(textClippingGE, clippingPoolPriority());
      else
      {
        if (crawler.collectTextClippingIfWorthwhile(textClippingGE, numSurrogatesFrom, clippingPoolPriority()))
        {
          mostRecentTextWeight = InterestModel.getInterestExpressedInTermVector(textClippingGE.getGeneric().termVector());
        }
        else
        {
          textClippingGE.recycle(false);
          crawlingTextClippings = false;
          //recycle(false);
        }
      }
    }
    else
      crawlingTextClippings = false;
  }

  private boolean firstClipping()
  {
    return numSurrogatesFrom == 0;
  }

  /**
   * Select the highest weighted candidate image DocumentClosure and offer it to the crawler:
   * unconditionally for the first clipping, otherwise only if the crawler deems it worthwhile.
   */
  protected synchronized void perhapsAddImageClosureToCrawler()
  {
    DocumentClosure imageClosure = null;
    if (candidateImageClosures != null)
      imageClosure = candidateImageClosures.maxSelect();
    if (imageClosure != null && imageClosure.termVector() != null && !imageClosure.termVector().isRecycled())
    {
      // If no surrogate has been delivered to the candidate pool from the container,
      // send it to the candidate pool without checking the media weight.
      if (firstClipping())
        crawler.addCandidateImage(imageClosure);
      else
      {
        if (crawler.collectImageIfWorthwhile(imageClosure, numSurrogatesFrom, clippingPoolPriority()))
        {
          mostRecentImageWeight = InterestModel.getInterestExpressedInTermVector(imageClosure.termVector());
        }
        else
        {
          //FIXME -- what about the ImageClipping?!
          imageClosure.recycle(false);
          crawlingImages = false;
          //recycle(false);
        }
      }
    }
    else
      crawlingImages = false;
  }

  /**
   * Continuation invoked for an image DocumentClosure: record the interest expressed in its
   * term vector, then try to promote the next candidate image closure.
   */
  @Override
  public void callback(DocumentClosure imageClosure)
  {
    mostRecentImageWeight = InterestModel.getInterestExpressedInTermVector(imageClosure.termVector());
    perhapsAddImageClosureToCrawler();
  }

  /**
   * Dispatch a collected Clipping to the image or text candidate collection.
   */
  @Override
  protected void collect(Clipping clipping)
  {
    if (clipping.isImage())
    {
      collect((ImageClipping) clipping);
    }
    else
    { // text clipping
      collect((TextClipping) clipping);
    }
    super.collect(clipping);
  }

  /**
   * Add an ImageClipping to our candidates collection.
   *
   * @param imageClipping
   */
  protected void collect(ImageClipping imageClipping)
  {
    if (candidateImageClosures == null)
      candidateImageClosures = new WeightSet<DocumentClosure>(new TermVectorWeightStrategy(InterestModel.getPIV()));
    Image media = imageClipping.getMedia();
    if (media != null)
    {
      candidateImageClosures.insert(media.getOrConstructClosure());
    }
  }

  /**
   * Add a TextClipping to our candidates collection.
   *
   * @param textClipping
   */
  protected void collect(TextClipping textClipping)
  {
    if (candidateTextClippings == null)
      candidateTextClippings = new GenericWeightSet<TextClipping>(new TermVectorWeightStrategy(InterestModel.getPIV()))
      {
        @Override
        public boolean insert(TextClipping go)
        {
          return insert(new MetadataElement<TextClipping>(go));
        }
      };
    candidateTextClippings.insert(textClipping);
  }

  /**
   * Start up collecting loops -- TextClippings, Images, DocumentClosures.
   */
  @Override
  protected void initiateCollecting()
  {
    if (crawler.isCollectingText())
    {
      crawlingTextClippings = true;
      perhapsAddTextClippingToCrawler();
    }
    if (crawler.isCollectingImages())
    {
      crawlingImages = true;
      perhapsAddImageClosureToCrawler();
    }
    super.initiateCollecting();
  }

  /**
   * If a local candidate TextClipping now expresses more interest than the most recently
   * collected one, promote a better candidate and return replaceMe to the local candidates.
   */
  public synchronized void tryToGetBetterTextAfterInterestExpression(GenericElement<TextClipping> replaceMe)
  {
    if (candidateTextClippings == null || candidateTextClippings.size() == 0)
      return;
    GenericElement<TextClipping> te = candidateTextClippings.maxPeek();
    if (InterestModel.getInterestExpressedInTermVector(te.getGeneric().termVector()) > mostRecentTextWeight)
    {
      crawler.removeTextClippingFromPools(replaceMe);
      perhapsAddTextClippingToCrawler();
      // perhapsAddTextClippingToCrawler() could call recycle on this container
      if (!this.isRecycled())
        candidateTextClippings.insert(replaceMe);
    }
  }

  /**
   * If a local candidate image closure now expresses more interest than the most recently
   * collected one, promote a better candidate and return replaceMe to the local candidates.
   */
  public synchronized void tryToGetBetterImageAfterInterestExpression(DocumentClosure replaceMe)
  {
    if (candidateImageClosures == null || candidateImageClosures.size() == 0)
      return;
    DocumentClosure aie = candidateImageClosures.maxPeek();
    if (InterestModel.getInterestExpressedInTermVector(aie.termVector()) > mostRecentImageWeight)
    {
      crawler.removeImageClippingFromPools(replaceMe);
      perhapsAddImageClosureToCrawler();
      candidateImageClosures.insert(replaceMe);
    }
  }

  /**
   * @return true if there are no media elements that this container is tracking
   */
  protected boolean isEmpty()
  {
    return ((candidateImageClosures == null) || (candidateImageClosures.size() == 0))
        && ((candidateTextClippings == null) || (candidateTextClippings.size() == 0))
        && super.isEmpty();
  }

  /**
   * Test for recyclable.
   *
   * @return true if this is still involved in collecting.
   */
  @Override
  protected boolean isActive()
  {
    return super.isActive() && crawlingImages && crawlingTextClippings;
  }
}