package org.meaningfulweb.cext.processors;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.meaningfulweb.cext.HtmlContentProcessor;
import org.meaningfulweb.util.XMLUtils;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jdom.Comment;
import org.jdom.Content;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.Text;

/**
 * Extracts the main content of an HTML page by removing non-content elements
 * (scripts, forms, comments, etc.) and containers that look like navigation or
 * lists of links, based on word-to-link ratios and text-length heuristics.
 */
public class MainContentProcessor extends HtmlContentProcessor {

  public static final Log LOG = LogFactory.getLog(MainContentProcessor.class);

  private double threshold = 10f;
  private int minTextLength = 20;
  private int minWords = 0;
  private int minLinks = 2;
  private int maxRecurseDepth = 250;
  private double linkListThreshold = .70f;
  private boolean extractHtml = true;
  private boolean extractText = true;

  /**
   * Returns true when the two counts are close enough to each other (their
   * ratio exceeds linkListThreshold) to suggest a container made up mostly of
   * links, e.g. a navigation list with roughly one link per list item.
   */
  private boolean isContainerSetup(int linkCount, int cntrCount) {
    if (cntrCount <= linkCount) {
      return ((float)cntrCount / (float)linkCount) > linkListThreshold;
    }
    else {
      return ((float)linkCount / (float)cntrCount) > linkListThreshold;
    }
  }

  // elements that never hold main content and are always removed
  private static Set<String> removeElements = new HashSet<String>();
  static {
    String[] remove = {"head", "script", "noscript", "style", "form", "meta",
      "input", "iframe", "embed", "hr", "img", "link", "label"};
    removeElements.addAll(Arrays.asList(remove));
  }

  // container elements that are candidates for link-list removal
  private static Set<String> containerElements = new HashSet<String>();
  static {
    String[] container = {"div", "table", "td", "th", "tr", "tbody", "thead",
      "tfoot", "col", "colgroup", "ul", "ol", "li", "html", "center", "span"};
    containerElements.addAll(Arrays.asList(container));
  }

  /**
   * Recursively collects comments and non-content elements to be removed.
   */
  private void cleanNodes(int level, Content node, Set<Content> remove) {

    // don't go on forever, spider traps can kill the JVM through stack overflow
    if (level == maxRecurseDepth) {
      return;
    }

    if (node instanceof Element) {
      Element elem = (Element)node;
      String name = StringUtils.lowerCase(elem.getName());
      if (removeElements.contains(name)) {
        remove.add(node);
      }
      List<Content> children = elem.getContent();
      if (children != null && children.size() > 0) {
        for (Content child : children) {
          cleanNodes(level + 1, child, remove);
        }
      }
    }
    else if (node instanceof Comment) {
      remove.add(node);
    }
  }

  /**
   * Recursively scores containers by comparing word counts to link counts and
   * marks link-heavy containers (navigation, link lists) for removal. Returns
   * a map of counts for the subtree rooted at the given node, or null when the
   * node is null or the maximum recursion depth has been reached.
   */
  private Map<String, Object> cleanLinkContainers(int level, Content node,
    Set<Content> remove) {

    // don't go on forever, spider traps can kill the JVM through stack overflow
    if (node == null || level == maxRecurseDepth) {
      return null;
    }

    int linkCount = 0;
    int liCount = 0;
    int textLength = 0;
    int wordCount = 0;
    int nodeCount = 0;
    int hTagCount = 0;
    boolean delete = false;
    float ratio = 0.0f;
    float totalRatio = 0.0f;
    boolean linkContainer = false;

    if (node instanceof Element) {

      nodeCount++;
      Element elem = (Element)node;
      String name = StringUtils.lowerCase(elem.getName());

      // aggregate the counts from all child subtrees
      List<Content> children = elem.getContent();
      if (children != null && children.size() > 0) {
        for (Content child : children) {
          Map<String, Object> data = cleanLinkContainers(level + 1, child, remove);
          if (data != null) {
            boolean deleteChild = (Boolean)data.get("delete");
            if (deleteChild) {
              remove.add(child);
            }
            else {
              textLength += (Integer)data.get("textLength");
            }
            linkCount += (Integer)data.get("linkCount");
            wordCount += (Integer)data.get("wordCount");
            nodeCount += (Integer)data.get("nodeCount");
            totalRatio += (Float)data.get("ratio");
            liCount += (Integer)data.get("liCount");
            hTagCount += (Integer)data.get("hTagCount");
          }
        }
      }

      if (name.equalsIgnoreCase("a")) {
        linkCount++;
      }
      else if (name.equalsIgnoreCase("li")) {
        liCount++;
      }
      else if (name.equalsIgnoreCase("ul")) {
        // heuristic for lists of links: a ul with roughly one link per li
        if (linkCount >= 2 && liCount >= 2 && isContainerSetup(linkCount, liCount)) {
          linkContainer = true;
          liCount = 0;
        }
      }
      else if (name.equalsIgnoreCase("h1") || name.equalsIgnoreCase("h2")
        || name.equalsIgnoreCase("h3") || name.equalsIgnoreCase("h4")
        || name.equalsIgnoreCase("h5")) {
        hTagCount++;
      }
      else if ((name.equalsIgnoreCase("div") || name.equalsIgnoreCase("p"))
        && linkCount >= 3 && hTagCount >= 3
        && isContainerSetup(linkCount, hTagCount)) {
        // heuristic for things that look like lists of linked headings
        linkContainer = true;
        hTagCount = 0;
      }

      if (containerElements.contains(name)) {

        // containers with no words are removed, otherwise compute the
        // word-to-link ratio for this subtree
        if (wordCount == 0) {
          delete = true;
        }
        else {
          float linkDenom = linkCount > 0 ? linkCount : 1;
          ratio = (float)wordCount / linkDenom;
          totalRatio += ratio;
        }

        // link-heavy containers are removed
        if (linkCount > minLinks && ratio < threshold && totalRatio < threshold) {
          delete = true;
        }

        // containers identified as link lists are removed
        if (linkContainer) {
          delete = true;
        }

        // containers with too little text are removed; spans are excluded
        // because they are often small inline fragments of larger blocks
        if (!name.equalsIgnoreCase("span")) {
          if (textLength < minTextLength || wordCount < minWords) {
            delete = true;
          }
        }
      }
    }
    else if (node instanceof Text) {
      // accumulate text length and word count for text nodes
      Text text = (Text)node;
      String normalized = text.getTextNormalize();
      if (StringUtils.isNotBlank(normalized)) {
        textLength += normalized.length();
        wordCount += StringUtils.split(normalized).length;
      }
    }

    Map<String, Object> output = new HashMap<String, Object>();
    output.put("textLength", textLength);
    output.put("wordCount", wordCount);
    output.put("linkCount", linkCount);
    output.put("nodeCount", nodeCount);
    output.put("hTagCount", hTagCount);
    output.put("liCount", liCount);
    output.put("ratio", ratio);
    output.put("delete", delete);
    return output;
  }

  @Override
  public boolean processContent(Document doc) {

    Set<Content> remove = new LinkedHashSet<Content>();
    Element rootElem = doc.getRootElement();

    // remove specific non-content elements and comments
    List<Content> contents = rootElem.getContent();
    for (Content child : contents) {
      cleanNodes(0, child, remove);
    }
    for (Content content : remove) {
      content.getParent().removeContent(content);
    }

    // clear the remove set
    remove.clear();

    // remove link container elements
    List<Content> containerContents = rootElem.getContent();
    for (Content child : containerContents) {
      cleanLinkContainers(0, child, remove);
    }
    for (Content content : remove) {
      content.getParent().removeContent(content);
    }

    // add the content html, only when the extracted html is non-blank
    if (extractHtml) {
      String contentHtml = XMLUtils.toHtml(doc);
      if (StringUtils.isNotBlank(contentHtml)) {
        addExtractedValue("html", contentHtml);
      }
    }

    // add the content text, only when the extracted text is non-blank
    if (extractText) {
      String contentText = XMLUtils.toText(doc);
      if (StringUtils.isNotBlank(contentText)) {
        addExtractedValue("text", contentText);
      }
    }

    return true;
  }

  public double getThreshold() {
    return threshold;
  }

  public void setThreshold(double threshold) {
    this.threshold = threshold;
  }

  public int getMinTextLength() {
    return minTextLength;
  }

  public void setMinTextLength(int minTextLength) {
    this.minTextLength = minTextLength;
  }

  public int getMinWords() {
    return minWords;
  }

  public void setMinWords(int minWords) {
    this.minWords = minWords;
  }

  public int getMinLinks() {
    return minLinks;
  }

  public void setMinLinks(int minLinks) {
    this.minLinks = minLinks;
  }

  public int getMaxRecurseDepth() {
    return maxRecurseDepth;
  }

  public void setMaxRecurseDepth(int maxRecurseDepth) {
    this.maxRecurseDepth = maxRecurseDepth;
  }

  public double getLinkListThreshold() {
    return linkListThreshold;
  }

  public void setLinkListThreshold(double linkListThreshold) {
    this.linkListThreshold = linkListThreshold;
  }

  public boolean isExtractHtml() {
    return extractHtml;
  }

  public void setExtractHtml(boolean extractHtml) {
    this.extractHtml = extractHtml;
  }

  public boolean isExtractText() {
    return extractText;
  }

  public void setExtractText(boolean extractText) {
    this.extractText = extractText;
  }
}
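
Below is a minimal, hypothetical usage sketch. It assumes the input markup is well-formed XHTML that JDOM's SAXBuilder can parse; the example class name and the sample markup are illustrative only, and reading back the extracted "html"/"text" values happens through the HtmlContentProcessor base class, whose accessor is not shown in this file.

// MainContentProcessorExample.java -- hypothetical usage sketch, not part of the library
import java.io.StringReader;

import org.jdom.Document;
import org.jdom.input.SAXBuilder;

import org.meaningfulweb.cext.processors.MainContentProcessor;

public class MainContentProcessorExample {

  public static void main(String[] args) throws Exception {

    // sample well-formed XHTML: one content block and one navigation-style link list
    String xhtml = "<html><body>"
      + "<div><p>Article body text with enough words to be kept as main content.</p></div>"
      + "<ul><li><a href=\"/a\">nav</a></li><li><a href=\"/b\">nav</a></li></ul>"
      + "</body></html>";

    // parse the markup into a JDOM document
    Document doc = new SAXBuilder().build(new StringReader(xhtml));

    MainContentProcessor processor = new MainContentProcessor();
    processor.setMinTextLength(10); // loosen the text-length threshold for this tiny sample
    processor.setExtractText(true);

    // prunes non-content elements and link containers in place, then stores the
    // surviving html/text via addExtractedValue(...) on the base class
    processor.processContent(doc);
  }
}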