package org.meaningfulweb.cext.processors;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.meaningfulweb.cext.HtmlContentProcessor;
import org.meaningfulweb.util.XMLUtils;
import org.apache.commons.lang.BooleanUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jdom.Attribute;
import org.jdom.Comment;
import org.jdom.Content;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.Text;

/**
 * Heuristic article extractor for a parsed HTML document.
 *
 * <p>{@link #processContent(Document)} removes non-content and structural
 * elements, annotates the remaining elements with temporary scoring
 * attributes, selects the element with the largest directly held text size,
 * prunes text-less and link-heavy nodes from it, and publishes the result
 * through {@code addExtractedValue} under the keys {@code "html"},
 * {@code "text"} and {@code "media"}. The document root is finally reduced to
 * the selected article element.
 */
public class ArticleProcessor extends HtmlContentProcessor {

  private boolean extractHtml = true;
  private boolean extractText = true;
  private boolean extractMedia = true;

  // Names of the temporary attributes written onto elements while scoring.
  private final static String TEXT_SIZE = "textSize";
  private final static String TOTAL_TEXT_IN_LINKS = "totalTextInLinks";
  private final static String TOTAL_TEXT_SIZE = "totalTextSize";
  private final static String TEXT_SIZE_SCORE = "textSizeScore";
  private final static String TOTAL_LINKS = "totalLinks";
  private final static String TOTAL_PTAGS = "totalPTags";
  private final static String TOTAL_IMAGES = "totalImages";
  private final static String TOTAL_H1TAGS = "totalH1Tags";
  private final static String TOTAL_NODES = "totalNodes";
  private final static String TOTAL_ELEMENT_NODES = "totalElementNodes";
  private final static String TOTAL_TEXT_NODES = "totalTextNodes";
  private final static String TOTAL_LEVELS = "totalLevels";
  private final static String HAS_DIRECT_TEXT = "hasDirectText";
  private final static String TEXT_NODE_SCORE = "textNodeScore";
  private final static String LINK_SCORE = "linkScore";
  private final static String LEVEL = "level";
  private final static String POSITION = "position";

  public static final Log LOG = LogFactory.getLog(ArticleProcessor.class);

  /** Every temporary attribute; all of them are stripped before output. */
  private static final String[] TEMP_ATTRIBUTES = {
    TEXT_SIZE, TOTAL_TEXT_SIZE, TOTAL_TEXT_IN_LINKS, TEXT_SIZE_SCORE,
    TOTAL_NODES, TOTAL_ELEMENT_NODES, TOTAL_TEXT_NODES, TOTAL_LINKS,
    TOTAL_PTAGS, TOTAL_H1TAGS, TOTAL_IMAGES, HAS_DIRECT_TEXT,
    TEXT_NODE_SCORE, LINK_SCORE, LEVEL, TOTAL_LEVELS, POSITION
  };

  /**
   * Elements dropped together with their whole subtree before scoring.
   * "label", "pre", "blockquote", "code" and "br" are deliberately not listed.
   * Because "h1" and "embed" are listed, such elements are normally gone
   * before the scoring and media-extraction passes run.
   */
  private static final Set<String> removeElements = new HashSet<String>(
    Arrays.asList("head", "script", "noscript", "style", "form", "meta",
      "input", "select", "textarea", "option", "hr", "link", "embed", "h1",
      "iframe"));

  /** Elements that the non-text cleanup never removes and never descends into. */
  private static final Set<String> ignoreElements = new HashSet<String>(
    Arrays.asList("br"));

  /** Elements collected as media from the selected article node. */
  private static final Set<String> mediaElements = new HashSet<String>(
    Arrays.asList("embed", "img"));

  /**
   * Substrings that mark an element as page structure rather than article
   * content when found in its lower-cased class or id value.
   */
  private static final String[] REMOVE_LABELS = {
    "header", "menu", "footer", "masthead", "comment", "quotes", "resources",
    "tools", "headline", "caption", "share", "font", "opinion", "timestamp",
    "posted", "metadata", "toolbar", "disqus", "sign_up", "nav", "like",
    "advertisement", "sidebar", "print", "email", "more", "links", "enlarge",
    "tags", "breadcrumb", "facebook", "stumble", "twitter", "callout",
    "widget", "related", "announcement", "neighbor", "sponsor", "support",
    "flash"
  };

  /**
   * Matches an inline style that hides the element. Compiled once and applied
   * with find(), so a style value containing line breaks is still recognised.
   */
  private static final Pattern DISPLAY_NONE =
    Pattern.compile("display:\\s*none");

  private int maxRecurseDepth = 250;

  /**
   * Returns the live content list of an element.
   *
   * @param elem the element whose children are wanted
   * @return the element's content list as handed out by JDOM
   */
  @SuppressWarnings("unchecked")
  private static List<Content> childrenOf(Element elem) {
    // JDOM 1.x returns a raw List here; its entries are Content instances.
    return elem.getContent();
  }

  /**
   * Reads an attribute as an int.
   *
   * @param elem the element carrying the attribute
   * @param attrName the attribute name
   * @return the parsed value, or 0 when the attribute is missing or not numeric
   */
  private static int intAttr(Element elem, String attrName) {
    Attribute attr = elem.getAttribute(attrName);
    return attr == null ? 0 : NumberUtils.toInt(attr.getValue());
  }

  /**
   * Reads an Integer entry from a score map.
   *
   * @param values the score map returned by {@link #scoreNodes(int, Content)}
   * @param key the entry name
   * @return the stored value, or 0 when the key is absent
   */
  private static int intValue(Map<String, Object> values, String key) {
    Object value = values.get(key);
    return value == null ? 0 : (Integer)value;
  }

  /**
   * Detaches every collected node from its parent and empties the set.
   *
   * @param remove the nodes scheduled for removal
   */
  private static void detachAll(Set<Content> remove) {
    for (Content content : remove) {
      // detach() is a no-op for a node that has no parent
      content.detach();
    }
    remove.clear();
  }

  /**
   * Strips all temporary scoring attributes from a single element.
   *
   * @param elem the element to clean
   */
  private void removeProcessingAttributes(Element elem) {
    for (String attrName : TEMP_ATTRIBUTES) {
      elem.removeAttribute(attrName);
    }
  }

  /**
   * Appends the lower-cased value of one attribute if it is present and not
   * blank.
   */
  private static void addLowerCasedValue(Element elem, String attrName,
    List<String> target) {
    Attribute attr = elem.getAttribute(attrName);
    if (attr == null) {
      return;
    }
    String value = StringUtils.lowerCase(attr.getValue());
    if (StringUtils.isNotBlank(value)) {
      target.add(value);
    }
  }

  /**
   * Collects the lower-cased, non-blank class and id values of an element.
   *
   * @param elem the element to inspect
   * @return the class value followed by the id value, each only when present
   */
  private List<String> getClassAndIdVals(Element elem) {
    List<String> attrVals = new ArrayList<String>();
    addLowerCasedValue(elem, "class", attrVals);
    addLowerCasedValue(elem, "id", attrVals);
    return attrVals;
  }

  /**
   * Tells whether the class or id of an element contains one of the
   * structural labels.
   */
  private boolean hasStructuralLabel(Element elem) {
    for (String attrVal : getClassAndIdVals(elem)) {
      for (String label : REMOVE_LABELS) {
        if (StringUtils.contains(attrVal, label)) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Schedules structural elements, hidden elements and comments for removal.
   *
   * <p>An element other than body is scheduled when its class or id contains
   * a structural label; any element is scheduled when its inline style
   * contains "display: none". A scheduled element is not descended into,
   * because its whole subtree leaves the document with it.
   *
   * @param level the current recursion depth
   * @param node the node to inspect
   * @param remove receives the nodes to remove
   */
  private void cleanStructureNodes(int level, Content node,
    Set<Content> remove) {

    // don't go on forever, spider traps can kill JVM through stack overflow
    if (level == maxRecurseDepth) {
      return;
    }

    if (node instanceof Comment) {
      remove.add(node);
      return;
    }
    if (!(node instanceof Element)) {
      return;
    }

    Element elem = (Element)node;
    String name = StringUtils.lowerCase(elem.getName());

    if (!StringUtils.equals("body", name) && hasStructuralLabel(elem)) {
      remove.add(node);
      return;
    }

    Attribute styleAttr = elem.getAttribute("style");
    if (styleAttr != null) {
      String styleVal = StringUtils.lowerCase(styleAttr.getValue());
      // short-circuit so a null style value is never dereferenced
      if (StringUtils.isNotBlank(styleVal)
        && DISPLAY_NONE.matcher(styleVal).find()) {
        remove.add(node);
        return;
      }
    }

    // every child sits exactly one level below its parent
    for (Content child : childrenOf(elem)) {
      cleanStructureNodes(level + 1, child, remove);
    }
  }

  /**
   * Finds the element with the largest TEXT_SIZE among the elements flagged
   * as holding direct text, body excluded. On a tie the element visited
   * later wins.
   *
   * @param level the current recursion depth
   * @param node the node to inspect
   * @param highest holds at most one element, the best candidate so far
   */
  private void findHighestNodeScore(int level, Content node,
    List<Content> highest) {

    // don't go on forever, spider traps can kill JVM through stack overflow
    if (level == maxRecurseDepth) {
      return;
    }
    if (!(node instanceof Element)) {
      return;
    }

    Element elem = (Element)node;
    String name = StringUtils.lowerCase(elem.getName());
    Attribute hdtAttr = elem.getAttribute(HAS_DIRECT_TEXT);
    boolean candidate = hdtAttr != null
      && BooleanUtils.toBoolean(hdtAttr.getValue())
      && !StringUtils.equals("body", name);

    if (candidate) {
      if (highest.isEmpty()) {
        highest.add(node);
      }
      else {
        Element prevHighest = (Element)highest.get(0);
        // a missing TEXT_SIZE attribute counts as 0 instead of failing
        if (intAttr(elem, TEXT_SIZE) >= intAttr(prevHighest, TEXT_SIZE)) {
          highest.clear();
          highest.add(node);
        }
      }
    }

    for (Content child : childrenOf(elem)) {
      findHighestNodeScore(level + 1, child, highest);
    }
  }

  /**
   * Schedules comments and elements named in the remove list for removal. A
   * scheduled element is not descended into.
   *
   * @param level the current recursion depth
   * @param node the node to inspect
   * @param remove receives the nodes to remove
   */
  private void cleanNonContentNodes(int level, Content node,
    Set<Content> remove) {

    // don't go on forever, spider traps can kill JVM through stack overflow
    if (level == maxRecurseDepth) {
      return;
    }

    if (node instanceof Comment) {
      remove.add(node);
      return;
    }
    if (!(node instanceof Element)) {
      return;
    }

    Element elem = (Element)node;
    String name = StringUtils.lowerCase(elem.getName());
    if (removeElements.contains(name)) {
      remove.add(node);
      return;
    }

    for (Content child : childrenOf(elem)) {
      cleanNonContentNodes(level + 1, child, remove);
    }
  }

  /**
   * Schedules elements without any text node, and link-heavy elements (at
   * least 3 links and a link score above 0.60), for removal.
   *
   * <p>Ignored elements and elements whose subtree contains an image are
   * kept and not descended into. A scheduled element is still descended
   * into: when the article root itself is scheduled it is nevertheless
   * rendered afterwards, so its descendants have to be cleaned as well.
   *
   * @param level the current recursion depth
   * @param node the node to inspect
   * @param remove receives the nodes to remove
   */
  private void cleanNonTextNodes(int level, Content node,
    Set<Content> remove) {

    // don't go on forever, spider traps can kill JVM through stack overflow
    if (level == maxRecurseDepth) {
      return;
    }
    if (!(node instanceof Element)) {
      return;
    }

    Element elem = (Element)node;
    String name = StringUtils.lowerCase(elem.getName());
    if (ignoreElements.contains(name)) {
      return;
    }
    if (intAttr(elem, TOTAL_IMAGES) > 0) {
      return;
    }

    // only an element that was actually scored can be judged text-less
    Attribute ttnAttr = elem.getAttribute(TOTAL_TEXT_NODES);
    boolean removed = false;
    if (ttnAttr != null && NumberUtils.toInt(ttnAttr.getValue()) == 0) {
      remove.add(node);
      removed = true;
    }

    if (!removed) {
      Attribute linkScoreAttr = elem.getAttribute(LINK_SCORE);
      Attribute numLinksAttr = elem.getAttribute(TOTAL_LINKS);
      if (linkScoreAttr != null && numLinksAttr != null) {
        double linkScore = NumberUtils.toDouble(linkScoreAttr.getValue());
        int numLinks = NumberUtils.toInt(numLinksAttr.getValue());
        if (numLinks >= 3 && linkScore > 0.60) {
          remove.add(node);
        }
      }
    }

    for (Content child : childrenOf(elem)) {
      cleanNonTextNodes(level + 1, child, remove);
    }
  }

  /**
   * Scores a node and, for an element, records the scores on it as temporary
   * attributes.
   *
   * <p>A non-blank text node reports its trimmed length as TEXT_SIZE and,
   * when its parent is an "a" element, the same length as
   * TOTAL_TEXT_IN_LINKS plus one TOTAL_LINKS. An element sums the values
   * reported by its children. It reports TEXT_SIZE only when it holds direct
   * text, meaning it is a "p", or has a "p" child or a non-blank text child.
   *
   * <p>NOTE(review): an "a" element counts itself as a link and then also
   * adds the TOTAL_LINKS reported by each of its non-blank text children, so
   * a plain text link is counted more than once. This is left untouched
   * because the link threshold used by the non-text cleanup may have been
   * tuned against it; confirm before changing.
   *
   * @param level the current recursion depth
   * @param node the node to score
   * @return the scores of the node, or null for a blank text node, for any
   *         other node type, or once the recursion limit is reached
   */
  private Map<String, Object> scoreNodes(int level, Content node) {

    // don't go on forever, spider traps can kill JVM through stack overflow
    if (level == maxRecurseDepth) {
      return null;
    }

    if (node instanceof Text) {
      String text = ((Text)node).getText();
      if (StringUtils.isBlank(text)) {
        return null;
      }
      Map<String, Object> textValues = new HashMap<String, Object>();
      int textSize = StringUtils.trim(text).length();
      textValues.put(TEXT_SIZE, textSize);
      Element parent = node.getParentElement();
      if (parent != null
        && StringUtils.equals(StringUtils.lowerCase(parent.getName()), "a")) {
        textValues.put(TOTAL_TEXT_IN_LINKS, textSize);
        textValues.put(TOTAL_LINKS, 1);
      }
      return textValues;
    }

    if (!(node instanceof Element)) {
      return null;
    }

    Element elem = (Element)node;
    String name = StringUtils.lowerCase(elem.getName());

    int curTextSize = 0;
    int totalTextInLinks = 0;
    int totalElementNodes = 0;
    int totalTextNodes = 0;
    int totalLinks = 0;
    int totalPTags = 0;
    int totalImages = 0;
    int totalH1Tags = 0;
    int totalLevels = 0;
    boolean hasDirectText = false;

    // the element counts itself towards the tag totals
    if (StringUtils.equals(name, "a")) {
      totalLinks++;
    }
    else if (StringUtils.equals(name, "p")) {
      totalPTags++;
      hasDirectText = true;
    }
    else if (StringUtils.equals(name, "img")) {
      totalImages++;
    }
    else if (StringUtils.equals(name, "h1")) {
      totalH1Tags++;
    }

    for (Content child : childrenOf(elem)) {

      if (child instanceof Element) {
        totalElementNodes++;
        String childName = StringUtils.lowerCase(((Element)child).getName());
        if (StringUtils.equals(childName, "p")) {
          hasDirectText = true;
        }
      }
      else if (child instanceof Text) {
        if (StringUtils.isNotBlank(((Text)child).getText())) {
          totalTextNodes++;
          hasDirectText = true;
        }
      }

      Map<String, Object> nestedValues = scoreNodes(level + 1, child);
      if (nestedValues == null) {
        continue;
      }
      curTextSize += intValue(nestedValues, TEXT_SIZE);
      totalTextInLinks += intValue(nestedValues, TOTAL_TEXT_IN_LINKS);
      totalLinks += intValue(nestedValues, TOTAL_LINKS);
      totalPTags += intValue(nestedValues, TOTAL_PTAGS);
      totalH1Tags += intValue(nestedValues, TOTAL_H1TAGS);
      totalImages += intValue(nestedValues, TOTAL_IMAGES);
      totalElementNodes += intValue(nestedValues, TOTAL_ELEMENT_NODES);
      totalTextNodes += intValue(nestedValues, TOTAL_TEXT_NODES);
      // the deepest child subtree decides, not whichever child came last
      totalLevels = Math.max(totalLevels,
        intValue(nestedValues, TOTAL_LEVELS));
    }

    // Each counted node is either an element or a non-blank text node, so
    // the node total is the sum of the two counters; adding the running
    // counters inside the loop counted the same nodes repeatedly.
    int totalNodes = totalElementNodes + totalTextNodes;

    // only the text sizes reported by the children are accumulated here
    int totalTextSize = curTextSize;

    Map<String, Object> values = new HashMap<String, Object>();

    if (hasDirectText) {
      elem.setAttribute(TEXT_SIZE, String.valueOf(curTextSize));
      values.put(TEXT_SIZE, curTextSize);
    }

    elem.setAttribute(TOTAL_TEXT_SIZE, String.valueOf(totalTextSize));
    elem.setAttribute(TOTAL_TEXT_IN_LINKS, String.valueOf(totalTextInLinks));
    elem.setAttribute(TEXT_SIZE_SCORE,
      String.valueOf(totalTextSize - totalTextInLinks));
    elem.setAttribute(TOTAL_NODES, String.valueOf(totalNodes));
    elem.setAttribute(TOTAL_ELEMENT_NODES, String.valueOf(totalElementNodes));
    elem.setAttribute(TOTAL_TEXT_NODES, String.valueOf(totalTextNodes));
    elem.setAttribute(TOTAL_LINKS, String.valueOf(totalLinks));
    elem.setAttribute(TOTAL_PTAGS, String.valueOf(totalPTags));
    elem.setAttribute(TOTAL_H1TAGS, String.valueOf(totalH1Tags));
    elem.setAttribute(TOTAL_IMAGES, String.valueOf(totalImages));
    elem.setAttribute(HAS_DIRECT_TEXT, String.valueOf(hasDirectText));
    elem.setAttribute(LEVEL, String.valueOf(level));
    elem.setAttribute(TOTAL_LEVELS, String.valueOf(totalLevels));

    values.put(TOTAL_TEXT_SIZE, totalTextSize);
    values.put(TOTAL_TEXT_IN_LINKS, totalTextInLinks);
    values.put(TOTAL_LINKS, totalLinks);
    values.put(TOTAL_PTAGS, totalPTags);
    values.put(TOTAL_H1TAGS, totalH1Tags);
    values.put(TOTAL_IMAGES, totalImages);
    values.put(TOTAL_NODES, totalNodes);
    values.put(TOTAL_ELEMENT_NODES, totalElementNodes);
    values.put(TOTAL_TEXT_NODES, totalTextNodes);
    // the parent sees one level more than this element does
    values.put(TOTAL_LEVELS, totalLevels + 1);

    // share of the accumulated text that sits directly inside links
    double linkScore = 0.0d;
    if (totalTextSize > 0 && totalTextInLinks > 0) {
      linkScore = (double)totalTextInLinks / (double)totalTextSize;
      elem.setAttribute(LINK_SCORE, String.valueOf(linkScore));
    }

    if (totalTextSize > 0 && totalElementNodes > 0) {
      double textNodeScore = (double)totalTextSize * (1.0 - linkScore);
      elem.setAttribute(TEXT_NODE_SCORE, String.valueOf(textNodeScore));
    }
    else if (totalTextSize > 0) {
      elem.setAttribute(TEXT_NODE_SCORE, String.valueOf(1.0));
    }

    return values;
  }

  /**
   * Numbers the elements below a node in document order by writing a
   * POSITION attribute onto each of them.
   *
   * @param level the position to give to the node
   * @param node the node to start from
   * @return the last position handed out in the subtree
   */
  private int positionNodes(int level, Content node) {
    return positionNodes(0, level, node);
  }

  /**
   * Recursive worker for {@link #positionNodes(int, Content)}. The depth is
   * tracked separately from the position, so the recursion guard really
   * limits the nesting depth instead of firing on the position counter.
   *
   * @param depth the current recursion depth
   * @param position the position to give to the node
   * @param node the node to number
   * @return the last position in use after this node
   */
  private int positionNodes(int depth, int position, Content node) {

    // don't go on forever, spider traps can kill JVM through stack overflow
    if (depth == maxRecurseDepth) {
      return position;
    }

    if (node instanceof Text) {
      // text nodes do not take up a position
      return position - 1;
    }

    if (node instanceof Element) {
      Element elem = (Element)node;
      elem.setAttribute(POSITION, String.valueOf(position));
      int last = position;
      for (Content child : childrenOf(elem)) {
        last = positionNodes(depth + 1, last + 1, child);
      }
      return last;
    }

    return position;
  }

  /**
   * Collects the media elements below a node. A media element is collected
   * as a whole and not descended into.
   *
   * @param level the current recursion depth
   * @param node the node to inspect
   * @param media receives the media elements in document order
   */
  private void extractMediaElements(int level, Content node,
    List<Element> media) {

    // don't go on forever, spider traps can kill JVM through stack overflow
    if (level == maxRecurseDepth) {
      return;
    }
    if (!(node instanceof Element)) {
      return;
    }

    Element elem = (Element)node;
    String name = StringUtils.lowerCase(elem.getName());
    if (mediaElements.contains(name)) {
      media.add(elem);
      return;
    }

    // every child sits exactly one level below its parent
    for (Content child : childrenOf(elem)) {
      extractMediaElements(level + 1, child, media);
    }
  }

  /**
   * Strips the temporary scoring attributes from an element and all elements
   * below it.
   *
   * @param level the current recursion depth
   * @param node the node to clean
   */
  private void removeTempAttrs(int level, Content node) {

    // don't go on forever, spider traps can kill JVM through stack overflow
    if (level == maxRecurseDepth) {
      return;
    }
    if (!(node instanceof Element)) {
      return;
    }

    Element elem = (Element)node;
    removeProcessingAttributes(elem);
    for (Content child : childrenOf(elem)) {
      removeTempAttrs(level + 1, child);
    }
  }

  /**
   * Extracts the main article of the document.
   *
   * <p>Publishes each media element as "media" and, when an article element
   * was found, its markup as "html" and its text as "text", subject to the
   * extract flags. In that case the content of the root element is replaced
   * by the article element.
   *
   * @param doc the parsed document; it is modified in place
   * @return always true
   */
  @Override
  public boolean processContent(Document doc) {

    Element rootElem = doc.getRootElement();
    List<Content> contents = childrenOf(rootElem);
    Set<Content> remove = new LinkedHashSet<Content>();

    // remove non-content nodes
    for (Content child : contents) {
      cleanNonContentNodes(0, child, remove);
    }
    detachAll(remove);

    // remove structure nodes
    for (Content child : contents) {
      cleanStructureNodes(0, child, remove);
    }
    detachAll(remove);

    // position and score nodes
    for (Content child : contents) {
      positionNodes(0, child);
    }
    for (Content child : contents) {
      scoreNodes(0, child);
    }

    // find the highest scoring content area
    List<Content> articleNodes = new ArrayList<Content>();
    for (Content child : contents) {
      findHighestNodeScore(0, child, articleNodes);
    }

    // extract media elements from highest section
    List<Element> media = new ArrayList<Element>();
    if (extractMedia) {
      for (Content node : articleNodes) {
        extractMediaElements(0, node, media);
      }
    }

    // remove non-text nodes
    for (Content child : articleNodes) {
      cleanNonTextNodes(0, child, remove);
    }
    detachAll(remove);

    // remove temporary attributes from all nodes
    for (Content child : contents) {
      removeTempAttrs(0, child);
    }

    // remove temp attributes from media, these could have gotten detached
    // and not be in the node tree cleaned above
    for (Element child : media) {
      removeTempAttrs(0, child);
      addExtractedValue("media", XMLUtils.toHtml(child));
    }

    if (!articleNodes.isEmpty()) {

      // only elements are ever selected as article nodes
      Element article = (Element)articleNodes.get(0);
      if (article != null) {
        if (extractHtml) {
          addExtractedValue("html", XMLUtils.toHtml(article));
        }
        if (extractText) {
          addExtractedValue("text", XMLUtils.toText(article));
        }
      }

      // a node must be parentless before it can be attached to the root
      for (Content connected : articleNodes) {
        connected.detach();
      }
      rootElem.setContent(articleNodes);
    }

    return true;
  }

  /** @return whether the article markup is published as "html" */
  public boolean isExtractHtml() {
    return extractHtml;
  }

  /** @param extractHtml whether to publish the article markup as "html" */
  public void setExtractHtml(boolean extractHtml) {
    this.extractHtml = extractHtml;
  }

  /** @return whether the article text is published as "text" */
  public boolean isExtractText() {
    return extractText;
  }

  /** @param extractText whether to publish the article text as "text" */
  public void setExtractText(boolean extractText) {
    this.extractText = extractText;
  }

  /** @return whether media elements of the article are published as "media" */
  public boolean isExtractMedia() {
    return extractMedia;
  }

  /** @param extractMedia whether to publish the article's media elements */
  public void setExtractMedia(boolean extractMedia) {
    this.extractMedia = extractMedia;
  }

  /** @return the nesting depth at which the tree traversals stop descending */
  public int getMaxRecurseDepth() {
    return maxRecurseDepth;
  }

  /** @param maxRecurseDepth the nesting depth at which traversals stop */
  public void setMaxRecurseDepth(int maxRecurseDepth) {
    this.maxRecurseDepth = maxRecurseDepth;
  }
}