package ecologylab.bigsemantics.documentparsers;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.ArrayList;

import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathExpressionException;

import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import ecologylab.appframework.types.prefs.PrefBoolean;
import ecologylab.bigsemantics.collecting.SemanticsSite;
import ecologylab.bigsemantics.html.DOMParserInterface;
import ecologylab.bigsemantics.html.ImgElement;
import ecologylab.bigsemantics.html.ParagraphText;
import ecologylab.bigsemantics.html.documentstructure.AnchorContext;
import ecologylab.bigsemantics.html.documentstructure.ImageFeatures;
import ecologylab.bigsemantics.html.documentstructure.LinkType;
import ecologylab.bigsemantics.html.documentstructure.RecognizedDocumentStructure;
import ecologylab.bigsemantics.html.documentstructure.SemanticAnchor;
import ecologylab.bigsemantics.html.dom.IDOMProvider;
import ecologylab.bigsemantics.html.utils.StringBuilderUtils;
import ecologylab.bigsemantics.metadata.builtins.Document;
import ecologylab.bigsemantics.metadata.builtins.Image;
import ecologylab.bigsemantics.metadata.builtins.ImageClipping;
import ecologylab.bigsemantics.metadata.builtins.RichDocument;
import ecologylab.bigsemantics.metadata.builtins.TextClipping;
import ecologylab.bigsemantics.model.text.utils.Filter;
import ecologylab.bigsemantics.seeding.SemanticsPrefs;
import ecologylab.generic.HashMapArrayList;
import ecologylab.generic.StringTools;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.XMLTools;

/**
 * Parses an HTML page and creates its DOM.
 *
 * @author eunyee
 */
public abstract class HTMLDOMParser<D extends Document> extends ContainerParser<D>
    implements DOMParserInterface
{

  static final PrefBoolean SHOW_PAGE_STRUCTURE_PREF =
      PrefBoolean.usePrefBoolean("show_page_structure", false);

  public static int MAX_TEXT_CONTEXT_LENGTH = 1500;

  /**
   * Trim the text context to MAX_TEXT_CONTEXT_LENGTH if it is too long.
   *
   * @param textContext
   * @return the trimmed text context
   */
  // FIXME -- call site for this should really be when we build contexts in walkAndTagDOM()
  public static String trimTooLongContext(String textContext)
  {
    if (textContext.length() > MAX_TEXT_CONTEXT_LENGTH)
      return textContext.substring(0, MAX_TEXT_CONTEXT_LENGTH);
    else
      return textContext;
  }

  /** <code>Filter</code> that recognizes junk images from their URLs */
  public static final Filter filter = new Filter(); // url filtering

  /**
   * Root DOM of the current document
   */
  private org.w3c.dom.Document dom;

  protected IDOMProvider provider;

  boolean indexPage = false;

  boolean contentPage = false;

  private boolean isFile = false;

  protected boolean bold;

  protected boolean italic;

  public HTMLDOMParser()
  {
    super();
  }

  public org.w3c.dom.Document getDom() throws IOException
  {
    org.w3c.dom.Document result = this.dom;
    if (result == null)
    {
      result = createDom();
      this.dom = result;
    }
    return result;
  }

  /**
   * @return The DOM parsed from the reader or input stream, or a bogus DOM with an empty root
   *         node.
   * @throws IOException
   */
  private org.w3c.dom.Document createDom() throws IOException
  {
    // long t0 = System.currentTimeMillis();
    provider = getSemanticsScope().constructDOMProvider();
    InputStream inputStream = inputStream();
    Reader reader = reader();
    org.w3c.dom.Document document = null;
    try
    {
      document = reader != null
          ? provider.parseDOM(reader, null)
          : provider.parseDOM(inputStream, null);
      // getLogRecord().setMsDomCreation(System.currentTimeMillis() - t0);
    }
    finally
    {
      if (inputStream != null)
      {
        inputStream.close();
      }
      if (reader != null) // also close the reader, so the reader-based path does not leak it
      {
        reader.close();
      }
    }
    return document;
  }
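  /**
   * Illustrative sketch (not called by the framework): shows the lazy-caching contract of
   * getDom() above. It assumes only what this class guarantees: the first call parses the
   * underlying stream or reader; later calls return the cached DOM instance.
   */
  @SuppressWarnings("unused")
  private void sketchDomCaching() throws IOException
  {
    org.w3c.dom.Document first = getDom();  // first call parses and caches
    org.w3c.dom.Document second = getDom(); // cache hit: same instance, no re-parse
    assert first == second;
  }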
  /**
   * @return The root node of the document, which should be &lt;html&gt;.
   * @throws IOException
   */
  public Node getRootNode() throws IOException
  {
    return getDom();
  }

  /**
   * Andruid says: NEVER override this method when you parse HTML. Instead, override postParse().
   *
   * @throws IOException
   */
  @Override
  abstract public void parse() throws IOException;

  @Override
  public void recycle()
  {
    dom = null;
    provider = null;
    super.recycle();
  }

  /**
   * Called when the parser sees the <code>&lt;title&gt;</code> tag.
   */
  public void setTitle(Node titleNode)
  {
    StringBuilder title = null;
    NodeList children = titleNode.getChildNodes();
    for (int i = 0; i < children.getLength(); i++)
    {
      Node node = children.item(i);
      if (node.getNodeType() == Node.TEXT_NODE)
      {
        title = StringBuilderUtils.trimAndDecodeUTF8(title, node, 0, true);
        if (title != null)
        {
          XMLTools.unescapeXML(title);
          getDocument().hwSetTitle(StringTools.toString(title));
          StringBuilderUtils.release(title);
        }
        break;
      }
    }
  }

  /**
   * Create a TextClipping and add it to the localCollection in the Container.
   */
  public void constructTextClipping(ParagraphText paraText)
  {
    if ((paraText != null) && (paraText.length() > 0))
    {
      StringBuilder buffy = paraText.getBuffy();
      if (buffy.indexOf("@") == -1) // filter out paragraphs with email addresses
      {
        TextClipping textClipping = new TextClipping(getSemanticsScope()
            .getMetaMetadataRepository().getMMByName(getSemanticsScope().TEXT_TAG));
        textClipping.setText(StringTools.toString(buffy));
        textClipping.setSourceDoc(getDocument());
        ((RichDocument) getDocument()).addClipping(textClipping);
      }
    }
  }

  public int numExtractedClippings()
  {
    return ((RichDocument) getDocument()).numClippings();
  }

  /**
   * Aggregates the AnchorContexts by their destination hrefs. For each distinct href, creates a
   * purl, constructs one SemanticAnchor that aggregates the semantics of all AnchorContexts
   * pointing at it, sets the metadata, and adds an outlink from the source document. Hrefs that
   * turn out to be images become ImageClippings instead.
   */
  public void generateCandidateContainersFromContexts(ArrayList<AnchorContext> anchorContexts,
                                                      boolean fromContentBody)
  {
    HashMapArrayList<ParsedURL, ArrayList<AnchorContext>> hashedAnchorContexts =
        new HashMapArrayList<ParsedURL, ArrayList<AnchorContext>>();
    for (AnchorContext anchorContext : anchorContexts)
    {
      ParsedURL destHref = anchorContext.getHref();
      if (destHref.isImg())
      {
        // The href is actually an image. Create a new img element and associate the text to it.
        Image newImage = getSemanticsScope().getOrConstructImage(destHref);
        if (newImage != null)
          newImage.constructClipping(getDocument(), null, null, anchorContext.getAnchorText());
        continue;
      }
      ArrayList<AnchorContext> arrayList = hashedAnchorContexts.get(destHref);
      if (arrayList == null)
      {
        arrayList = new ArrayList<AnchorContext>();
        hashedAnchorContexts.put(destHref, arrayList);
      }
      arrayList.add(anchorContext);
    }
    // Now that we have aggregated the AnchorContexts,
    // we generate one SemanticAnchor per purl, aggregating all the semantics of its set of
    // AnchorContexts.
    for (ParsedURL hrefPurl : hashedAnchorContexts.keySet())
    {
      ArrayList<AnchorContext> anchorContextsPerHref = hashedAnchorContexts.get(hrefPurl);
      SemanticAnchor semanticAnchor =
          new SemanticAnchor(fromContentBody ? LinkType.WILD_CONTENT_BODY : LinkType.WILD,
                             hrefPurl,
                             anchorContextsPerHref,
                             purl(),
                             1);
      handleSemanticAnchor(semanticAnchor, hrefPurl);
    }
  }
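  /**
   * Illustrative sketch (not called by the framework): the group-by-href step above, in
   * isolation. One SemanticAnchor is later made per key of the returned map, aggregating the
   * semantics of every AnchorContext that points at that href. java.util.LinkedHashMap stands in
   * here for ecologylab's HashMapArrayList, assuming both preserve insertion order.
   */
  @SuppressWarnings("unused")
  private static java.util.Map<ParsedURL, ArrayList<AnchorContext>> sketchGroupByHref(
      ArrayList<AnchorContext> anchorContexts)
  {
    java.util.Map<ParsedURL, ArrayList<AnchorContext>> grouped =
        new java.util.LinkedHashMap<ParsedURL, ArrayList<AnchorContext>>();
    for (AnchorContext anchorContext : anchorContexts)
    {
      ParsedURL destHref = anchorContext.getHref();
      ArrayList<AnchorContext> bucket = grouped.get(destHref);
      if (bucket == null)
      {
        bucket = new ArrayList<AnchorContext>();
        grouped.put(destHref, bucket); // first context seen for this href starts a new bucket
      }
      bucket.add(anchorContext);
    }
    return grouped;
  }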
  protected void handleSemanticAnchor(SemanticAnchor semanticAnchor, ParsedURL hrefPurl)
  {
    // FIXME -- should we depend on Seeding here?? or do this in post-processing for
    // CompoundDocumentParserCrawlerResult??
    if (hrefPurl != null && !hrefPurl.isNull() && getSemanticsScope().accept(hrefPurl))
    {
      Document hrefDocument = getSemanticsScope().getOrConstructDocument(hrefPurl);
      if (hrefDocument == null || hrefDocument.isRecycled())
      {
        warning("hrefDocument is null or recycled: " + hrefPurl);
        return; // Should actually raise an exception, but this can happen when a container is
                // not meant to be reincarnated
      }
      Document sourceDocument = getDocument();
      hrefDocument.addSemanticInlink(semanticAnchor, sourceDocument);
      sourceDocument.addCandidateOutlink(hrefDocument);
    }
  }

  public void setRecognizedDocumentStructure(Class<? extends RecognizedDocumentStructure> pageType)
  {
    if (SHOW_PAGE_STRUCTURE_PREF.value())
    {
      RichDocument metadata = (RichDocument) getDocument();
      if (metadata != null)
        metadata.setPageStructure(pageType.getSimpleName());
      else
        error("Can't setPageStructure() because metadata is null :-(");
    }
  }

  @Override
  public void setContent()
  {
    contentPage = true;
  }

  @Override
  public void setIndexPage()
  {
    indexPage = true;
  }

  @Override
  public boolean isIndexPage()
  {
    return indexPage;
  }

  @Override
  public boolean isContentPage()
  {
    return contentPage;
  }

  @Override
  public void removeTheContainerFromCandidates(ParsedURL containerPURL)
  {
    warning("Not Implemented: removeTheContainerFromCandidates(" + containerPURL + ")");
  }

  /**
   * Parser found a bold (or strong) tag or an end bold tag.
   */
  public void setBold(boolean on)
  {
    bold = on;
  }

  /**
   * Parser found an italic (or em) tag or an end italic tag.
   */
  public void setItalic(boolean on)
  {
    italic = on;
  }

  protected ParsedURL buildAndFilterPurl(String urlString)
  {
    ParsedURL result = buildPurl(urlString);
    return (result != null) && filterPurl(result) ? result : null;
  }

  /**
   * Filters the ParsedURL, checking that
   * <li>the semantics scope accepts it
   * <li>its name does not start with "File:"
   * <li>it is crawlable <br>
   * Does less checking when it's drag'n'drop (document == null).
   *
   * @param parsedURL
   * @return true if the ParsedURL passes all the filters
   */
  protected boolean filterPurl(ParsedURL parsedURL)
  {
    Document document = getDocument();
    // FIXME -- should we depend on Seeding here?? or do this in post-processing for
    // CompoundDocumentParserCrawlerResult??
    return parsedURL != null
        && getSemanticsScope().accept(parsedURL)
        && !parsedURL.getName().startsWith("File:")
        && parsedURL.crawlable()
        && (document == null // null-check must come before the dereferences below
            || (!document.isJustCrawl()
                && (parsedURL.isImg() || (isFile && parsedURL.isHTML()) || !isFile)));
  }

  protected ParsedURL buildPurl(String urlString)
  {
    Document sourceDocument = getDocument();
    return sourceDocument.isAnonymous()
        ? ParsedURL.createFromHTML(null, urlString, false)
        : sourceDocument.getLocation().createFromHTML(urlString, isSearchPage());
  }
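  /**
   * Illustrative sketch (not called by the framework): the intended use of buildAndFilterPurl()
   * when harvesting hrefs from the page. The raw href string here is hypothetical.
   */
  @SuppressWarnings("unused")
  private void sketchBuildAndFilter()
  {
    ParsedURL accepted = buildAndFilterPurl("relative/page.html");
    if (accepted != null)
    {
      // accepted resolved against the source document's location, passed the scope's accept()
      // check, is crawlable, and its name does not start with "File:"
    }
  }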
  /**
   * Add an image+text surrogate for this document that was extracted from a different document.
   * FIXME: this currently does the same thing as a surrogate extracted from this document, but we
   * might want to make a special collection for these "anchor surrogates".
   *
   * Really, this should be setting the outlink somehow...
   */
  public ImageClipping constructAnchorImageClipping(ImgElement imgNode, ParsedURL anchorHref)
  {
    RichDocument source = (RichDocument) getDocumentClosure().getDocument();
    ImageClipping clipping = constructImageClipping(getDocument(), source, null, imgNode);
    // CompoundDocument outlink = (CompoundDocument)
    // getSemanticsScope().getOrConstructDocument(anchorHref);
    // clipping.setOutlink(outlink);
    return clipping;
  }

  /**
   * Create image and text surrogates for this HTML document, and add these surrogates to the
   * localCollection in the Container.
   */
  public ImageClipping constructImageClipping(ImgElement imgNode, ParsedURL anchorHref)
  {
    if (getDocumentClosure().location().equals(anchorHref))
      debug("This should be something else here!!");
    debug("PART 1 " + anchorHref);
    Document outlink = getSemanticsScope().getOrConstructDocument(anchorHref);
    Document sourceDocument = getDocument();
    return constructImageClipping(sourceDocument, sourceDocument, outlink, imgNode);
  }

  /**
   * Construct an ImageClipping, associating it properly in the hypermedia graph.
   *
   * @param basisDocument
   *          The CompoundDocument to add the clipping to.
   * @param sourceDocument
   *          The CompoundDocument to be listed as the Clipping's source: the one it is a
   *          surrogate for. Usually the same as basisDocument, but for a surrogate for X found
   *          in Y, use the outlink here instead.
   * @param outlink
   *          The Document to be listed as the Clipping's href destination.
   * @param imgNode
   *          Representation of the source HTML, plus textContext and additional extractedCaption.
   *
   * @return the constructed ImageClipping, or null if the image is uninformative.
   */
  public ImageClipping constructImageClipping(Document basisDocument,
                                              Document sourceDocument,
                                              Document outlink,
                                              ImgElement imgNode)
  {
    ParsedURL srcPurl = imgNode.getSrc();
    ImageClipping result = null;
    if (srcPurl != null)
    {
      int width = imgNode.getWidth();
      int height = imgNode.getHeight();
      int mimeIndex = srcPurl.mediaMimeIndex();
      boolean isMap = imgNode.isMap();
      switch (ImageFeatures.designRole(width, height, mimeIndex, isMap))
      {
      case ImageFeatures.INFORMATIVE:
      case ImageFeatures.UNKNOWN:
        String alt = imgNode.getAlt();
        if (alt != null)
          alt = alt.trim();
        Image image = getSemanticsScope().getOrConstructImage(srcPurl);
        if (image == null)
          return null;
        image.setWidth(width);
        image.setHeight(height);
        result = image.constructClipping(basisDocument, sourceDocument, outlink, alt,
                                         imgNode.getTextContext());
        result.setXpath(imgNode.xpath());
        break;
      case ImageFeatures.UN_INFORMATIVE:
      default:
        getSemanticsScope().getLocalDocumentCollection().registerUninformativeImage(srcPurl);
      }
    }
    return result;
  }

  public static boolean isAd(ParsedURL hrefPurl)
  {
    String lc = hrefPurl.lc();
    return SemanticsPrefs.FILTER_OUT_ADS.value() && filter.matchLc(lc);
  }

  /**
   * @return true if <code>this</code> is a search page, and so needs special parsing of URLs to
   *         unpack nested entries; false in all other (usual) cases.
   */
  public boolean isSearchPage()
  {
    return false;
  }

  protected void findFaviconPath(Document doc, XPath xpath)
  {
    if (dom == null)
    {
      warning("DOM is null (maybe direct binding case); not looking up the favicon by xpath.");
      return;
    }
    String faviconHref = "";
    try
    {
      faviconHref = xpath.evaluate("//link[@rel=\"shortcut icon\"]/@href", dom);
    }
    catch (XPathExpressionException e)
    {
      e.printStackTrace();
    }
    if (faviconHref != null && !faviconHref.isEmpty()) // compare content, not references
    {
      // Found one
      // System.out.println("Got a path: " + faviconHref);
      SemanticsSite site = doc.getSite();
      site.setFaviconPath(faviconHref, doc.getLocation());
    }
    else
    {
      // Did not. Look in the root.
    }
  }

}
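/**
 * Illustrative sketch, not wired into the framework: the favicon lookup in findFaviconPath(),
 * reduced to plain JAXP. Given any parsed w3c DOM, the same XPath expression yields the declared
 * shortcut-icon href, or the empty string if none is declared.
 */
class FaviconXPathSketch
{
  static String faviconHref(org.w3c.dom.Document dom) throws XPathExpressionException
  {
    // a fresh XPath instance; findFaviconPath() receives one from its caller instead
    XPath xpath = javax.xml.xpath.XPathFactory.newInstance().newXPath();
    return xpath.evaluate("//link[@rel=\"shortcut icon\"]/@href", dom);
  }
}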