/**
*
*/
package ecologylab.bigsemantics.documentparsers;
import java.util.List;
import ecologylab.bigsemantics.collecting.ContainerWeightingStrategy;
import ecologylab.bigsemantics.collecting.Crawler;
import ecologylab.bigsemantics.collecting.DownloadStatus;
import ecologylab.bigsemantics.collecting.SemanticsGlobalScope;
import ecologylab.bigsemantics.metadata.builtins.Clipping;
import ecologylab.bigsemantics.metadata.builtins.RichDocument;
import ecologylab.bigsemantics.metadata.builtins.Document;
import ecologylab.bigsemantics.metadata.builtins.DocumentClosure;
import ecologylab.bigsemantics.model.text.InterestModel;
import ecologylab.collections.WeightSet;
import ecologylab.generic.Debug;
/**
 * The per-{@link RichDocument} component of the basic Crawler, involving outlinks, but not
 * ImageClippings or TextClippings.
 * <p>
 * Candidate outlinks are accumulated in a lazily created {@link WeightSet}; the best one is
 * offered to the crawler's pool by {@link #perhapsAddOutlinkClosureToCrawler()}.
 *
 * @param <CR> the concrete {@link Crawler} type obtained from the semantics scope.
 *
 * @author andruid
 */
public class RichDocumentParserCrawlerResult<CR extends Crawler> extends Debug
    implements ParserResult
{
  /** The document whose clippings and outlinks this result collects. */
  protected RichDocument richDocument;

  /** Scope taken from the document at construction time. */
  protected final SemanticsGlobalScope semanticsSessionScope;

  /** Crawler taken from {@link #semanticsSessionScope} at construction time. */
  protected final CR crawler;

  /** Candidate outlink closures; stays null until the first candidate is accepted. */
  private WeightSet<DocumentClosure> candidateLocalOutlinks;

  /** True while the outlink collecting loop of this result is running. */
  boolean crawlingOutlinks;

  /** A candidate's weight must be strictly greater than this to be offered to the crawler. */
  protected static final double MIN_WEIGHT_THRESHOLD = 0.;

  /** Cleared by the first call to {@link #clippingPoolPriority()}. */
  protected boolean useFirstCandidateWeight = true;

  /** Set once {@link #recycle()} has run. */
  protected boolean recycled;

  /** Number of surrogates from this container in a candidate pool */
  protected int numSurrogatesFrom = 0;

  /**
   * Bind this result to a document, and to the scope and crawler reachable from it.
   *
   * @param compoundDocument the document being parsed; must not be null.
   */
  @SuppressWarnings("unchecked") // the scope exposes its crawler untyped; CR is chosen by the caller
  public RichDocumentParserCrawlerResult(RichDocument compoundDocument)
  {
    this.richDocument = compoundDocument;
    this.semanticsSessionScope = compoundDocument.getSemanticsScope();
    this.crawler = (CR) semanticsSessionScope.getCrawler();
  }

  //////////////////////////////////////// candidates loops state ////////////////////////////////////////////////////////////

  /**
   * Offer a document as a candidate outlink of this container.
   * <p>
   * The document is accepted only if it is not a seed, its download is not already done, and
   * its closure exists and is still {@link DownloadStatus#UNPROCESSED}. A null document is
   * ignored.
   *
   * @param newOutlink the outlink document to consider; may be null.
   */
  public void addCandidateOutlink(Document newOutlink)
  {
    if (newOutlink == null)
      return;
    if (newOutlink.isSeed() || newOutlink.getDownloadStatus() == DownloadStatus.DOWNLOAD_DONE)
      return;

    DocumentClosure documentClosure = newOutlink.getOrConstructClosure();
    if (documentClosure == null || documentClosure.getDownloadStatus() != DownloadStatus.UNPROCESSED)
      return;

    // the candidate set is created lazily, on the first accepted outlink
    if (candidateLocalOutlinks == null)
      candidateLocalOutlinks = new WeightSet<DocumentClosure>(new ContainerWeightingStrategy(InterestModel.getPIV()));
    candidateLocalOutlinks.insert(documentClosure);
  }

  /**
   * Priority to use for the clipping pool.
   * <p>
   * The first call returns 0 when the document is a seed and 1 otherwise; every later call
   * returns 2, because the call clears {@link #useFirstCandidateWeight}.
   *
   * @return 0, 1 or 2 as described above.
   */
  protected int clippingPoolPriority()
  {
    int result;
    if (useFirstCandidateWeight)
      result = richDocument.isSeed() ? 0 : 1;
    else
      result = 2;
    useFirstCandidateWeight = false;
    return result;
  }

  /**
   * Offer the best candidate outlink, if any, to the crawler's pool.
   * <ol>
   * <li>With no candidates, this result is made inactive and recycled.</li>
   * <li>If the maximum candidate weight exceeds {@link #MIN_WEIGHT_THRESHOLD}, the selected
   * candidate is passed to the crawler; a successful add keeps this result alive.</li>
   * <li>Otherwise, or if the crawler refuses the candidate, this result is made inactive and
   * recycled.</li>
   * </ol>
   */
  protected synchronized void perhapsAddOutlinkClosureToCrawler()
  {
    if (candidateLocalOutlinks == null || candidateLocalOutlinks.size() == 0)
    {
      makeInactiveAndConsiderRecycling();
      return;
    }

    double maxWeight = candidateLocalOutlinks.maxWeight();
    boolean doRecycle = true;
    if (maxWeight > MIN_WEIGHT_THRESHOLD)
    {
      // NOTE(review): maxSelect() presumably removes the element it returns -- confirm in WeightSet
      DocumentClosure candidate = candidateLocalOutlinks.maxSelect();
      doRecycle = !crawler.addClosureToPool(candidate); // successful add means do not recycle
    }
    else
    {
      // Debug only
      debug("This container failed to provide a decent container so is going bye bye, max weight was " + maxWeight);
    }

    if (doRecycle)
      makeInactiveAndConsiderRecycling();
  }

  /**
   * Mark the outlink loop as finished and recycle this result.
   * <p>
   * Recycling here is unconditional; {@link #isActive()} is not consulted.
   */
  private void makeInactiveAndConsiderRecycling()
  {
    crawlingOutlinks = false;
    recycle();
  }

  /**
   * Recycle this result only if it is no longer involved in collecting; otherwise log why it
   * was kept.
   */
  private void considerRecycling()
  {
    // recycle only once nothing is active; recycling while still active would be premature
    if (!isActive())
      recycle();
    else
      debug("DIDNT RECYCLE AFTER CONSIDERATION.\nCONTAINERS_ACTIVE: "
          + crawlingOutlinks);
  }

  /**
   * Test for recycleable.
   *
   * @return true if this is still involved in collecting.
   */
  protected boolean isActive()
  {
    return crawlingOutlinks;
  }

  /**
   * Collect the outlink of every clipping of the document, then start the outlink loop.
   * <p>
   * A document without a clippings collection is treated as having no clippings.
   */
  public void collect()
  {
    Iterable<? extends Clipping> clippings = richDocument.getClippings();
    if (clippings != null)
    {
      for (Clipping clipping : clippings)
      {
        collect(clipping);
      }
    }
    initiateCollecting();
  }

  /**
   * Start up collecting loops -- DocumentClosures only.
   */
  protected void initiateCollecting()
  {
    crawlingOutlinks = true;
    perhapsAddOutlinkClosureToCrawler();
  }

  /**
   * Try to hand the outlink of one clipping to the crawler's pool.
   * <p>
   * Uses the clipping's own outlink closure when present; otherwise falls back to the closure
   * of the first entry of the clipping's outlinks list. The closure is added only when the
   * scope reports its location as new.
   *
   * @param clipping the clipping whose outlink should be collected.
   */
  protected void collect(Clipping clipping)
  {
    // try collecting the outlink
    DocumentClosure outlinkClosure = clipping.getOutlinkClosure();
    if (outlinkClosure == null)
    {
      List<Document> outlinks = clipping.getOutlinks();
      if (outlinks != null && outlinks.size() > 0)
        outlinkClosure = outlinks.get(0).getOrConstructClosure();
    }
    if (outlinkClosure != null && semanticsSessionScope.isLocationNew(outlinkClosure.location()))
      crawler.addClosureToPool(outlinkClosure);
  }

  /**
   * @return true if this result holds nothing to collect; here, the same as
   *         {@link #outlinksIsEmpty()}.
   */
  protected boolean isEmpty()
  {
    return outlinksIsEmpty();
  }

  /**
   * @return true if the candidate outlink set has not been created or has no elements.
   */
  protected boolean outlinksIsEmpty()
  {
    return ((candidateLocalOutlinks == null) || (candidateLocalOutlinks.size() == 0));
  }

  /**
   * Put a closure into the candidate set and take the best candidate out in exchange.
   * <p>
   * Insert and select happen under the candidate set's monitor. The result may be {@code c}
   * itself if it is the one selected.
   *
   * @param c the closure to insert into the candidate set.
   * @return the closure selected by {@code maxSelect()}, or null if there were no candidates
   *         (in which case {@code c} is not inserted).
   */
  public DocumentClosure swapNextBestOutlinkWith(DocumentClosure c)
  {
    if (outlinksIsEmpty())
      return null;
    synchronized (candidateLocalOutlinks)
    {
      candidateLocalOutlinks.insert(c);
      return candidateLocalOutlinks.maxSelect();
    }
  }

  /**
   * @return true once {@link #recycle()} has been called.
   */
  public boolean isRecycled()
  {
    return recycled;
  }

  /**
   * Mark this result as recycled. Repeated calls have no further effect.
   */
  @Override
  public synchronized void recycle()
  {
    if (!recycled)
    {
      recycled = true;
      //FIXME -- IMPLEMENT RECYCLE!!!
    }
  }
}