DocumentParser.java example

Explorer
BigSemanticsJava-master
/**
 * 
 */
package ecologylab.bigsemantics.documentparsers;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.List;

import ecologylab.bigsemantics.actions.SemanticsConstants;
import ecologylab.bigsemantics.collecting.SemanticsGlobalScope;
import ecologylab.bigsemantics.downloadcontrollers.DownloadController;
import ecologylab.bigsemantics.logging.DocumentLogRecord;
import ecologylab.bigsemantics.metadata.Metadata;
import ecologylab.bigsemantics.metadata.builtins.Document;
import ecologylab.bigsemantics.metadata.builtins.DocumentClosure;
import ecologylab.bigsemantics.metametadata.MetaMetadata;
import ecologylab.bigsemantics.metametadata.MetaMetadataCompositeField;
import ecologylab.bigsemantics.metametadata.MetaMetadataRepository;
import ecologylab.bigsemantics.seeding.Seed;
import ecologylab.collections.Scope;
import ecologylab.generic.Debug;
import ecologylab.generic.ReflectionTools;
import ecologylab.net.ParsedURL;

/**
 * Super class for all document parser types. This class obtains the connection to a document. A
 * parse method may be present to process the document.
 * <p/>
 * The role of a parser is to translate a Document into some kind of metadata and semantic actions.
 * Parsers start with a PURL, which the static connect() method translates into a PURLConnection,
 * and an appropriate instance of a subclass. The {@link #parse() parse()} method is then called to
 * translate the document into the semantic model, using the PURLConnection.
 * <p/>
 * The translation from a PURL to an appropriate subclass instance is performed using a combination
 * of the PURL's extension, if it has a useful one, and then the mime-type returned by the
 * URLConnection response header. These keys are then used to perform a lookup in one of the
 * registries maintained in this class.
 * 
 * @author andruid
 * @author eunyee
 */
@SuppressWarnings(
{ "rawtypes", "unchecked" })
abstract public class DocumentParser<D extends Document> extends Debug
{

  /**
   * Parser classes keyed by binding name. Written by {@link #register} and read by
   * {@link #getByBinding}.
   */
  static final Scope<Class<? extends DocumentParser>> bindingParserMap     = new Scope<Class<? extends DocumentParser>>();

  /**
   * Not read or written anywhere in this class. NOTE(review): presumably maps a mime-type to a
   * parser class -- confirm against callers in this package.
   */
  static final Scope<Class<? extends DocumentParser>> registryByMimeType   = new Scope<Class<? extends DocumentParser>>();

  /**
   * Not read or written anywhere in this class. NOTE(review): presumably maps a URL suffix to a
   * parser class -- confirm against callers in this package.
   */
  static final Scope<Class<? extends DocumentParser>> registryBySuffix     = new Scope<Class<? extends DocumentParser>>();

  /**
   * Not read or written anywhere in this class. NOTE(review): presumably maps a class name to a
   * parser class -- confirm against callers in this package.
   */
  static final Scope<Class<? extends DocumentParser>> registryByClassName  = new Scope<Class<? extends DocumentParser>>();

  /**
   * Suffixes that {@link #isRegisteredNoParser(ParsedURL)} has already been asked about. Despite
   * the name, this is a set, not a map.
   */
  static final HashSet<String>                        NO_PARSER_SUFFIX_MAP = new HashSet<String>();

  // Register the built-in parsers under their binding names.
  static
  {
    register(SemanticsConstants.DIRECT_BINDING_PARSER, DirectBindingParser.class);
    register(SemanticsConstants.XPATH_PARSER, XPathParser.class);
    register(SemanticsConstants.FEED_PARSER, FeedParser.class);
    register(SemanticsConstants.HTML_IMAGE_DOM_TEXT_PARSER,
             HTMLDOMImageTextParser.class);
    register(SemanticsConstants.PDF_PARSER, PdfParser.class);
  }

  /**
   * Allow clients to register or re-register parsers by name.
   * 
   * This can be useful when a client needs to provide an alternative implementation for a parser.
   * For example, the semantics service may need to use a different algorithm for HTML image text
   * clipping derivation in another parser.
   * 
   * @param binding
   *          the binding name to store the parser class under.
   * @param parserClass
   *          the parser class to associate with the binding name.
   */
  public static void register(String binding, Class<? extends DocumentParser> parserClass)
  {
    bindingParserMap.put(binding, parserClass);
  }

  /**
   * Get registered parser by binding.
   * <p/>
   * Looks up the class registered for the binding, instantiates it reflectively, and hands the
   * three collaborators to the new instance through its setters.
   * 
   * @param binding
   *          the binding name to look up; may be null.
   * @param semanticsScope
   *          the scope to set on the new parser.
   * @param documentClosure
   *          the closure to set on the new parser.
   * @param downloadController
   *          the download controller to set on the new parser.
   * @return a configured parser, or null if the binding is null, no class is registered for it,
   *         or no instance could be obtained.
   */
  public static DocumentParser getByBinding(String binding,
                                            SemanticsGlobalScope semanticsScope,
                                            DocumentClosure documentClosure,
                                            DownloadController downloadController)
  {
    if (binding == null)
      return null;

    Class<? extends DocumentParser> parserClass =
        (Class<? extends DocumentParser>) bindingParserMap.get(binding);
    if (parserClass == null)
      return null;

    DocumentParser result = ReflectionTools.getInstance(parserClass);
    // NOTE(review): guard in case getInstance() yields null when instantiation fails -- confirm
    // against ReflectionTools. Without the guard a failed instantiation would surface as a
    // NullPointerException here instead of a null return.
    if (result != null)
    {
      result.setSemanticsScope(semanticsScope);
      result.setDocumentClosure(documentClosure);
      result.setDownloadController(downloadController);
    }
    return result;
  }

  /**
   * Get registered parser by meta-metadata.
   * 
   * @param mmd
   *          the meta-metadata whose parser binding is used for the lookup; may be null.
   * @param semanticsScope
   *          the scope to set on the new parser.
   * @param documentClosure
   *          the closure to set on the new parser.
   * @param downloadController
   *          the download controller to set on the new parser.
   * @return the result of {@link #getByBinding} for the binding named by mmd, or null if mmd is
   *         null.
   */
  public static DocumentParser getByMmd(MetaMetadata mmd,
                                        SemanticsGlobalScope semanticsScope,
                                        DocumentClosure documentClosure,
                                        DownloadController downloadController)
  {
    if (mmd == null)
      return null;

    // getByBinding() itself returns null for a null binding, so no extra check is needed here.
    return DocumentParser.getByBinding(mmd.getParser(),
                                       semanticsScope,
                                       documentClosure,
                                       downloadController);
  }

  /**
   * Check whether the suffix of the input PURL has already been recorded as not associated with
   * any parser, and record it if it has not.
   * <p/>
   * Note that this method has a side effect: the first call for a given suffix records it, so a
   * second call with the same suffix returns true.
   * 
   * @param purl
   *          the location whose suffix is checked; may be null.
   * @return true if the suffix of purl was already recorded; false if it was recorded by this
   *         call, or if purl is null or has a null or empty suffix.
   */
  public static boolean isRegisteredNoParser(ParsedURL purl)
  {
    if (purl == null)
      return false;

    String suffix = purl.suffix();
    if (suffix == null || suffix.length() == 0)
      return false;

    // Set.add() returns false when the element was already present, which is exactly the
    // "already recorded" answer; this also records the suffix when it is new.
    return !NO_PARSER_SUFFIX_MAP.add(suffix);
  }

  /** Set through {@link #setSemanticsScope}; cleared by {@link #recycle()}. */
  private SemanticsGlobalScope semanticsScope;

  /** Set through {@link #setDownloadController}; replaced by {@link #reConnect()}. */
  private DownloadController   downloadController;

  /** Set through {@link #setDocumentClosure}; cleared by {@link #recycle()}. */
  private DocumentClosure      documentClosure;

  /**
   * Default constructor.
   */
  protected DocumentParser()
  {
    super();
  }

  /**
   * @return The semantics scope of this parser; null if never set or after {@link #recycle()}.
   */
  public SemanticsGlobalScope getSemanticsScope()
  {
    return semanticsScope;
  }

  /**
   * @param semanticsScope
   *          the semantics scope this parser should use.
   */
  protected void setSemanticsScope(SemanticsGlobalScope semanticsScope)
  {
    this.semanticsScope = semanticsScope;
  }

  /**
   * @return The document closure of this parser; null if never set or after {@link #recycle()}.
   */
  public DocumentClosure getDocumentClosure()
  {
    return documentClosure;
  }

  /**
   * @param documentClosure
   *          the closure holding the document this parser works on.
   */
  protected void setDocumentClosure(DocumentClosure documentClosure)
  {
    this.documentClosure = documentClosure;
  }

  /**
   * @return The download controller of this parser; null if never set.
   */
  public DownloadController getDownloadController()
  {
    return downloadController;
  }

  /**
   * @param downloadController
   *          the download controller this parser should read from.
   */
  protected void setDownloadController(DownloadController downloadController)
  {
    this.downloadController = downloadController;
  }

  /**
   * @return The document held by the document closure, or null if there is no document closure
   *         (for example after {@link #recycle()}).
   */
  public D getDocument()
  {
    return documentClosure == null ? null : (D) documentClosure.getDocument();
  }

  /**
   * Subclasses can implement this for looking up downloaded document.
   * 
   * @param metadata
   *          unused by this base implementation.
   * @return null in this base implementation.
   */
  protected Document lookupDownloadedDocument(Metadata metadata)
  {
    return null;
  }

  /**
   * Subclasses can implement this for looking up true meta-metadata and returning the right
   * metadata object.
   * 
   * @param repository
   *          unused by this base implementation.
   * @param thisMetadata
   *          unused by this base implementation.
   * @return null in this base implementation.
   */
  protected Metadata lookupTrueMetaMetadata(MetaMetadataRepository repository, Metadata thisMetadata)
  {
    return null;
  }

  /**
   * @return Meta-metadata of the corresponding document, or null if there is no document closure
   *         or no document.
   */
  public MetaMetadataCompositeField getMetaMetadata()
  {
    if (documentClosure == null)
      return null;

    Document document = documentClosure.getDocument();
    return document == null ? null : document.getMetaMetadata();
  }

  /**
   * Connects to a SeedDistributor, when appropriate.
   * 
   * @return null always for the default base class.
   */
  public Seed getSeed()
  {
    return null;
  }

  /**
   * @return A LogRecord that can be used to log events through the lifecycle of a Document, or
   *         null if there is no document closure (for example after {@link #recycle()}).
   */
  public DocumentLogRecord getLogRecord()
  {
    return documentClosure == null ? null : documentClosure.getLogRecord();
  }

  /**
   * Parse the document.
   * 
   * @throws IOException
   */
  public abstract void parse() throws IOException;

  /**
   * Chooses between the document's own location and the location known to the download
   * controller.
   * <p/>
   * When a download controller is present, its location (the original location, or else the first
   * of the response's other PURLs) is returned if any of the following holds: the document has no
   * location; the document location is a file; the controller location is non-null and not a
   * file; or the semantics scope is not a service. In every other case the document location is
   * returned.
   * 
   * @return The ParsedURL value of the current document, or null if there is no document closure
   *         or no document. If you want to get a URL value, you can use url() method in
   *         ParsedURL.
   */
  public ParsedURL purl()
  {
    if (documentClosure == null)
      return null;

    Document document = documentClosure.getDocument();
    if (document == null)
      return null;

    ParsedURL docPurl = document.getLocation();

    if (downloadController != null)
    {
      ParsedURL connPurl = downloadController.getOriginalLocation();
      if (connPurl == null)
      {
        // No original location: fall back to the first of the other PURLs, if there is one.
        List<ParsedURL> redirects = downloadController.getHttpResponse().getOtherPurls();
        if (redirects != null && redirects.size() > 0)
        {
          connPurl = redirects.get(0);
        }
      }

      // NOTE(review): connPurl can still be null here, so this branch can return null even
      // though docPurl is non-null (e.g. docPurl is a file, or the scope is not a service).
      // Confirm whether callers rely on that before changing it.
      if (docPurl == null || docPurl.isFile()
          || (connPurl != null && !connPurl.isFile()) || !semanticsScope.isService())
      {
        return connPurl;
      }
    }
    return docPurl;
  }

  /**
   * The document's PURL, without considering connection PURLs.
   * 
   * @return the location of the document, or null if there is no document closure or no
   *         document.
   */
  public ParsedURL getTruePURL()
  {
    if (documentClosure == null)
      return null;

    Document document = documentClosure.getDocument();
    return document == null ? null : document.getLocation();
  }

  /**
   * Use the DocumentClosure to close the current connection. Re-open a connection to the same
   * location. Use the same Document object; don't process re-directs, or anything like that.
   * Re-connect simply. Reset the downloadController field of this to the controller returned by
   * the DocumentClosure.
   * 
   * @return InputStream for the new connection.
   * @throws IOException
   */
  public InputStream reConnect() throws IOException
  {
    // Named differently from the field so that it is obvious which one is being read below.
    DownloadController newController = documentClosure.reConnect();
    this.downloadController = newController;
    return newController.getHttpResponse().getContentAsStream();
  }

  /**
   * @return The input stream to the raw document being downloaded, if any; null when there is no
   *         download controller.
   * @throws UnsupportedEncodingException
   */
  protected InputStream inputStream() throws UnsupportedEncodingException
  {
    if (downloadController != null)
    {
      return downloadController.getHttpResponse().getContentAsStream();
    }
    return null;
  }

  /**
   * @return The reader of the raw document; null in this base implementation.
   */
  protected Reader reader()
  {
    return null;
  }

  /**
   * Handle I/O error. This base implementation ignores the argument and calls
   * {@link #recycle()}.
   * 
   * @param e
   *          the error that occurred; not inspected here.
   */
  public void handleIoError(Throwable e)
  {
    recycle();
  }

  /**
   * Free resources. Clears the references to the semantics scope and the document closure; the
   * download controller reference is left as it is.
   */
  public void recycle()
  {
    semanticsScope = null;
    documentClosure = null;
  }

  /**
   * True if our analysis indicates the present AbstractContainer is an article, and not a
   * collection of links. This affects calls to getWeight() in the model!
   * 
   * @return true for an article. false for a collection of links (like a homepage). This base
   *         implementation always returns true.
   */
  public boolean isAnArticle()
  {
    return true;
  }

  /**
   * @return false in this base implementation.
   */
  public boolean isIndexPage()
  {
    return false;
  }

  /**
   * @return false in this base implementation.
   */
  public boolean isContentPage()
  {
    return false;
  }

  /**
   * Differentiates referential documents, like HTML from composite documents, like PDF.
   * 
   * @return true if images referred to by this document are stored within the document itself. The
   *         default implementation, here returns false.
   */
  public boolean isCompositeDocument()
  {
    return false;
  }

  /**
   * @return true to avoid building a DOM at the beginning. This base implementation returns
   *         false.
   */
  public boolean doesDirectBinding()
  {
    return false;
  }

  /**
   * @return The superclass string followed by the result of {@link #purl()} in square brackets,
   *         or by "[no purl]" when no PURL string is available.
   */
  @Override
  public String toString()
  {
    ParsedURL purl = purl();
    String purlString = (purl != null) ? purl.toString() : null;
    if (purlString == null)
      purlString = "no purl";
    return super.toString() + "[" + purlString + "]";
  }

}