package ecologylab.bigsemantics.actions;

import java.util.List;
import ecologylab.bigsemantics.collecting.Crawler;
import ecologylab.bigsemantics.documentparsers.DocumentParser;
import ecologylab.bigsemantics.documentparsers.SearchParser;
import ecologylab.bigsemantics.html.documentstructure.LinkType;
import ecologylab.bigsemantics.metadata.builtins.Document;
import ecologylab.bigsemantics.metadata.builtins.DocumentClosure;
import ecologylab.bigsemantics.seeding.Feed;
import ecologylab.bigsemantics.seeding.SearchState;
import ecologylab.bigsemantics.seeding.Seed;
import ecologylab.bigsemantics.seeding.SeedDistributor;
import ecologylab.bigsemantics.seeding.SeedDistributor.DistributorContinuation;
import ecologylab.generic.Continuation;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.annotations.Hint;
import ecologylab.serialization.annotations.simpl_hints;
import ecologylab.serialization.annotations.simpl_inherit;
import ecologylab.serialization.annotations.simpl_scalar;
import ecologylab.serialization.annotations.simpl_tag;

/**
 * Semantic action that parses a document referenced by this action: either right away
 * ({@code now}), by collecting it as a candidate outlink and later picking only the top documents,
 * or by queueing it for later download and parsing.
 */
@simpl_inherit
@simpl_tag(SemanticActionStandardMethods.PARSE_DOCUMENT)
public class ParseDocumentSemanticAction extends ContinuableSemanticAction
{
/** If true, parse the target document immediately instead of queueing it. */
@simpl_scalar
@simpl_hints(Hint.XML_ATTRIBUTE)
protected boolean now = false;

/** The link type recorded when creating or looking up the target document. */
@simpl_scalar
@simpl_hints(Hint.XML_ATTRIBUTE)
protected LinkType linkType = LinkType.OTHER_SEMANTIC_ACTION;

/**
 * Set this when only the top documents should actually be sent to the infoCollector. Candidate
 * documents are collected as outlinks of their ancestor, and once iteration over the anchors
 * finishes, up to numberOfTopDocuments of them are offered to the document pool.
 */
@simpl_scalar
@simpl_hints(Hint.XML_ATTRIBUTE)
protected boolean onlyPickTopDocuments = false;

/** How many top documents to pick when onlyPickTopDocuments is set. */
@simpl_scalar
@simpl_hints(Hint.XML_ATTRIBUTE)
protected int numberOfTopDocuments = 1;
public boolean isNow()
{
return now;
}

public boolean onlyPickTopDocument()
{
return onlyPickTopDocuments;
}

@Override
public String getActionName()
{
return SemanticActionStandardMethods.PARSE_DOCUMENT;
}

@Override
public void handleError()
{
// TODO -- error handling is not implemented for this action.
}
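
/**
 * Basic form of this action, invoked from perform() when the session has a crawler: if {@code now}
 * is set, resolve the target document, link it to its source document (re-using the source
 * closure's continuations, e.g. for search results), and queue the download.
 */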
public Object performBasic(Object obj)
{
if (isNow())
{
Document document = getOrCreateDocument(documentParser, linkType);
Document source = documentParser.getDocument();
if (source != null)
{
document.addInlink(source);
// when there is a source, re-use its dispatch target, e.g. for results from a search
List<Continuation<DocumentClosure>> continuations =
source.getOrConstructClosure().getContinuations();
document.getOrConstructClosure().addContinuations(continuations);
}
document.queueDownload();
}
return null;
}
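
/**
 * Does nothing when running as a service; otherwise delegates to performBasic() if the session has
 * a crawler (performFull() is not yet selectable; see the TODO below).
 */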
@Override
public Object perform(Object obj)
{
if (sessionScope.isService())
{
return null;
}
// TODO -- add pref to choose performFull!
return sessionScope.hasCrawler() ? performBasic(obj) : null;
}
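
/**
 * Full form of this action: resolves the target document (which may be null, e.g. when the url is
 * actually an image url), then parses it immediately, collects it for top-document picking, or
 * queues it for later parsing, depending on this action's attributes.
 */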
public Object performFull(Object obj)
{
Document document = getOrCreateDocument(documentParser, linkType);
if (document == null)
{
// getOrCreateDocument() can return null, e.g. when the url is actually an image url.
// In that case there is no document to parse, so return immediately.
return null;
}
if (isNow())
{
parseDocumentNow(document);
}
else if (onlyPickTopDocument())
{
pickTopDocuments(document);
}
else
{
parseDocumentLater(document);
}
return null;
}
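
/**
 * Collects the candidate document as an outlink of its ancestor. When the anchor-list loop (and
 * any enclosing loop) reaches its last iteration, asks the ancestor to offer up to
 * numberOfTopDocuments of those outlinks to the document pool.
 */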
private void pickTopDocuments(Document candidateDocument)
{
Document ancestor = candidateDocument.getAncestor();
if (ancestor == null)
{
warning("Parsing document [" + candidateDocument
+ "] that has no ancestor; cannot collect it as a candidate outlink.");
}
else
{
ancestor.addCandidateOutlink(candidateDocument);
}
// When the anchor-list loop (and any enclosing outer loop) reaches its last iteration, ask the
// ancestor to pick the top documents from its candidate outlinks.
int curAnchorIndex = getArgumentInteger(CURRENT_INDEX, -1);
int anchorListSize = getArgumentInteger(SIZE, -1);
int outerLoopIndex = getArgumentInteger(OUTER_LOOP_INDEX, -1);
int outerLoopSize = getArgumentInteger(OUTER_LOOP_SIZE, -1);
// If there is an outer loop, it must also be on its last iteration; if there is none, disregard it.
boolean outerLoopEnd = outerLoopSize > 0 ? outerLoopIndex == outerLoopSize - 1 : true;
boolean loopEnd = anchorListSize > 0 && curAnchorIndex == anchorListSize - 1;
if (loopEnd && outerLoopEnd && ancestor != null)
{
debugT(" Reached end of iterations with outlinks size (" /* + ancestor.numOutlinks() */
+ ").\n\t\tPicking " + numberOfTopDocuments
+ " top documents from outlinks of container: " + ancestor);
int numDocumentsRemaining = numberOfTopDocuments;
while (numDocumentsRemaining-- > 0)
ancestor.perhapsAddDocumentClosureToPool();
}
}
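
/**
 * Parses the document right away: constructs its closure, attaches this action as a continuation
 * if one is defined, and either distributes the closure as a seeding result (search or feed) or
 * queues its download directly.
 */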
protected void parseDocumentNow(Document document)
{
// The document may be null, e.g. for rejected domains.
if (document != null)
{
DocumentClosure documentClosure = document.getOrConstructClosure();
if (documentClosure == null)
{
warning("Can't parse " + document.getLocation() + " because its closure is null.");
return;
}
if (continuation != null)
{
documentClosure.addContinuation(this); // for continuation semantic actions
}
if (!distributeSeedingResults(this, documentParser, documentClosure, null))
documentClosure.queueDownload(); // the normal (non-seeding) case
}
}
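
/**
 * Defers parsing: constructs the document's closure and, unless its download has already been
 * queued, either distributes it as a seeding result (handing it to the crawler's pool when
 * distributed) or adds it to the crawler's pool directly.
 */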
protected void parseDocumentLater(Document document)
{
DocumentClosure documentClosure = document.getOrConstructClosure();
if (documentClosure == null || documentClosure.downloadHasBeenQueued())
warning("Can't parse " + document.getLocation()
+ " because null container or already queued.");
else
{
final Crawler crawler = sessionScope.getCrawler();
if (!distributeSeedingResults(this, documentParser, documentClosure,
new DistributorContinuation()
{
@Override
public void distribute(DocumentClosure result)
{
if (crawler != null)
crawler.addClosureToPool(result); // hand the distributed closure to the crawler's pool
}
}))
{
if (crawler != null)
{
if (continuation != null)
{
documentClosure.addContinuation(this); // for continuation semantic actions :-)!
}
crawler.addClosureToPool(documentClosure);
}
}
}
}

/**
 * If possible, distribute a seeding result through the SeedDistributor, e.g. for {@code <search>}
 * or {@code <feed>} seeds.
 *
 * @param action
 *          the semantic action requesting distribution.
 * @param documentParser
 *          the parser that produced the result; provides the seed.
 * @param resultContainer
 *          the closure for the result document.
 * @param distributorContinuation
 *          callback invoked when the distributor releases the result; may be null.
 * @return true if a seeding result is distributed; false if not applicable (e.g. a normal page).
 */
protected boolean distributeSeedingResults(SemanticAction action,
DocumentParser documentParser,
DocumentClosure resultContainer,
DistributorContinuation distributorContinuation)
{
SeedDistributor resultsDistributor = null;
Seed searchSeed = documentParser.getSeed();
String engineString = "";
if (searchSeed != null)
{
// the document came from a seed, e.g. a search or a feed
resultsDistributor = searchSeed.seedDistributer(sessionScope);
// will be non-null only for search result documents or feed item documents
if (resultsDistributor == null)
return false;
if (searchSeed instanceof SearchState)
{
engineString = ((SearchState) searchSeed).getEngine() + " ";
}
}
resultContainer.delete(); // remove from any and all candidate pools!
if (searchSeed instanceof SearchState)
{
SearchParser metaMetadataSearchParser = (SearchParser) documentParser;
int resultNum = metaMetadataSearchParser.getResultNum();
ParsedURL resultPURL = resultContainer.location();
final String msg =
"Queueing " + engineString + "search result " + (resultNum) + ": " + resultPURL;
sessionScope.displayStatus(msg);
int resultSoFar = metaMetadataSearchParser.getResultSoFar();
resultContainer.setSearchResult(resultsDistributor, resultSoFar);
// TODO -- add continuation semantic actions!!!
resultsDistributor.queueResult(resultContainer, distributorContinuation);
metaMetadataSearchParser.incrementResultSoFar();
return true;
}
else if (searchSeed instanceof Feed)
{
int rank = action.getArgumentInteger(SemanticActionNamedArguments.RANK, 0);
resultContainer.setSearchResult(resultsDistributor, rank);
// TODO -- add continuation semantic actions!!!
resultsDistributor.queueResult(resultContainer, distributorContinuation);
return true;
}
return false;
}
}