ParseDocumentSemanticAction.java example

Explorer
BigSemanticsJava-master
/**
 * 
 */
package ecologylab.bigsemantics.actions;

import java.util.List;

import ecologylab.bigsemantics.collecting.Crawler;
import ecologylab.bigsemantics.documentparsers.DocumentParser;
import ecologylab.bigsemantics.documentparsers.SearchParser;
import ecologylab.bigsemantics.html.documentstructure.LinkType;
import ecologylab.bigsemantics.metadata.builtins.Document;
import ecologylab.bigsemantics.metadata.builtins.DocumentClosure;
import ecologylab.bigsemantics.seeding.Feed;
import ecologylab.bigsemantics.seeding.SearchState;
import ecologylab.bigsemantics.seeding.Seed;
import ecologylab.bigsemantics.seeding.SeedDistributor;
import ecologylab.bigsemantics.seeding.SeedDistributor.DistributorContinuation;
import ecologylab.generic.Continuation;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.annotations.Hint;
import ecologylab.serialization.annotations.simpl_hints;
import ecologylab.serialization.annotations.simpl_inherit;
import ecologylab.serialization.annotations.simpl_scalar;
import ecologylab.serialization.annotations.simpl_tag;

/**
 * 
 */
@simpl_inherit
public @simpl_tag(SemanticActionStandardMethods.PARSE_DOCUMENT)
class ParseDocumentSemanticAction extends ContinuableSemanticAction
{

  @simpl_scalar
  @simpl_hints(Hint.XML_ATTRIBUTE)
  protected boolean  now                  = false;

  @simpl_scalar
  @simpl_hints(Hint.XML_ATTRIBUTE)
  protected LinkType linkType             = LinkType.OTHER_SEMANTIC_ACTION;

  /**
   * This attribute is meant to be used when we only require the top document to actually be sent to
   * the infoCollector. It requires two strings
   */
  @simpl_scalar
  @simpl_hints(Hint.XML_ATTRIBUTE)
  protected boolean  onlyPickTopDocuments = false;

  @simpl_scalar
  @simpl_hints(Hint.XML_ATTRIBUTE)
  protected int      numberOfTopDocuments = 1;

  public boolean isNow()
  {
    return now;
  }

  public boolean onlyPickTopDocument()
  {
    return onlyPickTopDocuments;
  }

  @Override
  public String getActionName()
  {
    return SemanticActionStandardMethods.PARSE_DOCUMENT;
  }

  @Override
  public void handleError()
  {
    // TODO Auto-generated method stub

  }

  public Object performBasic(Object obj)
  {
    if (isNow())
    {
      Document document = getOrCreateDocument(documentParser, linkType);

      Document source = documentParser.getDocument();
      if (source != null)
      {
        document.addInlink(source);
        // if there is a source, we should re-use that dispatch target.
        // e.g. search results from a search
        List<Continuation<DocumentClosure>> continuations =
            source.getOrConstructClosure().getContinuations();
        document.getOrConstructClosure().addContinuations(continuations);
      }
      document.queueDownload();
    }
    return null;
  }

  @Override
  public Object perform(Object obj)
  {
    if (sessionScope.isService())
    {
      return null;
    }

    // TODO -- add pref to choose performFull!
    return sessionScope.hasCrawler() ? performBasic(obj) : null;
  }

  public Object performFull(Object obj)
  {

    Document document = getOrCreateDocument(documentParser, linkType);
    if (document == null)
    {
      // candidateContainer can be null, e.g. the url is actually an image url (in which case
      // infoCollector.getContainer() will return null). if this is the case we return immediately
      // since there is no document to parse.
      return null;
    }

    if (isNow())
    {
      parseDocumentNow(document);
    }
    else if (onlyPickTopDocument())
    {
      pickTopDocuments(document);
    }
    else
    {
      parseDocumentLater(document);
    }

    return null;
  }

  private void pickTopDocuments(Document candidateDocument)
  {
    Document ancestor = candidateDocument.getAncestor();
    if (ancestor == null || candidateDocument == null)
    {
      warning("Parsing a document [" + candidateDocument + "] with ancestor ! - "
          + candidateDocument);
    }
    else
      ancestor.addCandidateOutlink(candidateDocument);

    // if currentIndex of foreach (if it is in one) == size - 1
    // ancestor.perhapsAddAdditionalContainer
    int curAnchorIndex = getArgumentInteger(CURRENT_INDEX, -1);
    int anchorListSize = getArgumentInteger(SIZE, -1);
    int outerLoopIndex = getArgumentInteger(OUTER_LOOP_INDEX, -1);
    int outerLoopSize = getArgumentInteger(OUTER_LOOP_SIZE, -1);

    // If the outerLoop exists, the outerIndex must be size -1, else outerLoop doesn't exist, so
    // disregard.
    boolean outerLoopEnd = outerLoopSize > 0 ? outerLoopIndex == outerLoopSize - 1 : true;
    boolean loopEnd = anchorListSize > 0 && curAnchorIndex == anchorListSize - 1;

    if (loopEnd && outerLoopEnd && ancestor != null)
    {
      debugT(" Reached end of iterations with outlinks size (" /* + ancestor.numOutlinks() */
          + ").\n\t\tPicking " + numberOfTopDocuments
          + " top documents from outlinks of container: " + ancestor);
      int numDocumentsRemaining = numberOfTopDocuments;
      while (numDocumentsRemaining-- > 0)
        ancestor.perhapsAddDocumentClosureToPool();
    }
  }

  protected void parseDocumentNow(Document document)
  {

    // In current implementation create_container_for_search may return null[for rejected
    // domains.]
    if (document != null)
    {
      DocumentClosure documentClosure = document.getOrConstructClosure();
      if (continuation != null)
      {
        documentClosure.addContinuation(this); // for continuation semantic actions :-)!
      }
      if (documentClosure == null)
        warning("Can't parse " + document.getLocation() + " because null container.");
      else if (!distributeSeedingResults(this, documentParser, documentClosure, null))
        documentClosure.queueDownload(); // case for normal documents
    }
  }

  protected void parseDocumentLater(Document document)
  {
    DocumentClosure documentClosure = document.getOrConstructClosure();

    if (documentClosure == null || documentClosure.downloadHasBeenQueued())
      warning("Can't parse " + document.getLocation()
          + " because null container or already queued.");
    else
    {
      final Crawler crawler = sessionScope.getCrawler();
      if (!distributeSeedingResults(this, documentParser, documentClosure,
                                    new DistributorContinuation()
                                    {
                                      @Override
                                      public void distribute(DocumentClosure result)
                                      {
                                        if (crawler != null)
                                          crawler.addClosureToPool(result); // ?? just curious:
                                                                            // isn't result the same
                                                                            // as documentClosure?!
                                      }
                                    }))
      {
        if (crawler != null)
        {
          if (continuation != null)
          {
            documentClosure.addContinuation(this); // for continuation semantic actions :-)!
          }
          crawler.addClosureToPool(documentClosure);
        }
      }
    }
  }

  /**
   * If possible, distribute a seeding result through SeedDistributor. e.g. for <search> or
   * <feed>.
   * 
   * @param action
   * @param documentParser
   * @param semanticsSessionScope
   * @param resultContainer
   * @return true if a seeding result is distributed; false if not applicable (e.g. a normal page).
   */
  protected boolean distributeSeedingResults(SemanticAction action,
                                             DocumentParser documentParser,
                                             DocumentClosure resultContainer,
                                             DistributorContinuation distributorContinuation)
  {
    SeedDistributor resultsDistributor = null;
    Seed searchSeed = documentParser.getSeed();
    String engineString = "";

    if (searchSeed != null)
    {
      // its a search type
      resultsDistributor = searchSeed.seedDistributer(sessionScope);
      // will be non-null only for search result documents or feed item documents
      if (resultsDistributor == null)
        return false;

      if (searchSeed instanceof SearchState)
      {
        engineString = ((SearchState) searchSeed).getEngine() + " ";
      }
    }

    resultContainer.delete(); // remove from any and all candidate pools!

    if (searchSeed instanceof SearchState)
    {
      SearchParser metaMetadataSearchParser = (SearchParser) documentParser;
      int resultNum = metaMetadataSearchParser.getResultNum();
      ParsedURL resultPURL = resultContainer.location();
      final String msg =
          "Queueing " + engineString + "search result " + (resultNum) + ": " + resultPURL;
      sessionScope.displayStatus(msg);
      int resultSoFar = metaMetadataSearchParser.getResultSoFar();
      resultContainer.setSearchResult(resultsDistributor, resultSoFar);
      // TODO -- add continuation semantic actions!!!
      resultsDistributor.queueResult(resultContainer, distributorContinuation);
      metaMetadataSearchParser.incrementResultSoFar();

      return true;
    }
    else if (searchSeed instanceof Feed)
    {
      int rank = action.getArgumentInteger(SemanticActionNamedArguments.RANK, 0);
      resultContainer.setSearchResult(resultsDistributor, rank);
      // TODO -- add continuation semantic actions!!!
      resultsDistributor.queueResult(resultContainer, distributorContinuation);

      return true;
    }

    return false;
  }

}