package org.meaningfulweb.cext.processors;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.meaningfulweb.cext.HtmlContentProcessor;
import org.meaningfulweb.util.XMLUtils;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jdom.Comment;
import org.jdom.Content;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.Text;

/**
 * Extracts the main content of an HTML page by removing non-content elements
 * (scripts, forms, comments, etc.) and containers that look like navigation or
 * lists of links, based on word-to-link ratios and text-length heuristics.
 */
public class MainContentProcessor extends HtmlContentProcessor {

  public static final Log LOG = LogFactory.getLog(MainContentProcessor.class);

  private double threshold = 10f;
  private int minTextLength = 20;
  private int minWords = 0;
  private int minLinks = 2;
  private int maxRecurseDepth = 250;
  private double linkListThreshold = .70f;
  private boolean extractHtml = true;
  private boolean extractText = true;

  /**
   * Returns true when the two counts are close enough to each other (their
   * ratio exceeds linkListThreshold) to suggest a container made up mostly of
   * links, e.g. a navigation list with roughly one link per list item.
   */
  private boolean isContainerSetup(int linkCount, int cntrCount) {
    if (cntrCount <= linkCount) {
      return ((float)cntrCount / (float)linkCount) > linkListThreshold;
    }
    else {
      return ((float)linkCount / (float)cntrCount) > linkListThreshold;
    }
  }

  // elements that never hold main content and are always removed
  private static Set<String> removeElements = new HashSet<String>();
  static {
    String[] remove = {"head", "script", "noscript", "style", "form", "meta",
      "input", "iframe", "embed", "hr", "img", "link", "label"};
    removeElements.addAll(Arrays.asList(remove));
  }

  // container elements that are candidates for link-list removal
  private static Set<String> containerElements = new HashSet<String>();
  static {
    String[] container = {"div", "table", "td", "th", "tr", "tbody", "thead",
      "tfoot", "col", "colgroup", "ul", "ol", "li", "html", "center", "span"};
    containerElements.addAll(Arrays.asList(container));
  }

  /**
   * Recursively collects comments and non-content elements to be removed.
   */
  private void cleanNodes(int level, Content node, Set<Content> remove) {

    // don't go on forever, spider traps can kill the JVM through stack overflow
    if (level == maxRecurseDepth) {
      return;
    }

    if (node instanceof Element) {
      Element elem = (Element)node;
      String name = StringUtils.lowerCase(elem.getName());
      if (removeElements.contains(name)) {
        remove.add(node);
      }
      List<Content> children = elem.getContent();
      if (children != null && children.size() > 0) {
        for (Content child : children) {
          cleanNodes(level + 1, child, remove);
        }
      }
    }
    else if (node instanceof Comment) {
      remove.add(node);
    }
  }

  /**
   * Recursively scores containers by comparing word counts to link counts and
   * marks link-heavy containers (navigation, link lists) for removal. Returns
   * a map of counts for the subtree rooted at the given node, or null when the
   * node is null or the maximum recursion depth has been reached.
   */
  private Map<String, Object> cleanLinkContainers(int level, Content node,
    Set<Content> remove) {

    // don't go on forever, spider traps can kill the JVM through stack overflow
    if (node == null || level == maxRecurseDepth) {
      return null;
    }

    int linkCount = 0;
    int liCount = 0;
    int textLength = 0;
    int wordCount = 0;
    int nodeCount = 0;
    int hTagCount = 0;
    boolean delete = false;
    float ratio = 0.0f;
    float totalRatio = 0.0f;
    boolean linkContainer = false;

    if (node instanceof Element) {

      nodeCount++;
      Element elem = (Element)node;
      String name = StringUtils.lowerCase(elem.getName());

      // aggregate the counts from all child subtrees
      List<Content> children = elem.getContent();
      if (children != null && children.size() > 0) {
        for (Content child : children) {
          Map<String, Object> data = cleanLinkContainers(level + 1, child, remove);
          if (data != null) {
            boolean deleteChild = (Boolean)data.get("delete");
            if (deleteChild) {
              remove.add(child);
            }
            else {
              textLength += (Integer)data.get("textLength");
            }
            linkCount += (Integer)data.get("linkCount");
            wordCount += (Integer)data.get("wordCount");
            nodeCount += (Integer)data.get("nodeCount");
            totalRatio += (Float)data.get("ratio");
            liCount += (Integer)data.get("liCount");
            hTagCount += (Integer)data.get("hTagCount");
          }
        }
      }

      if (name.equalsIgnoreCase("a")) {
        linkCount++;
      }
      else if (name.equalsIgnoreCase("li")) {
        liCount++;
      }
      else if (name.equalsIgnoreCase("ul")) {
        // heuristic for lists of links: a ul with roughly one link per li
        if (linkCount >= 2 && liCount >= 2 && isContainerSetup(linkCount, liCount)) {
          linkContainer = true;
          liCount = 0;
        }
      }
      else if (name.equalsIgnoreCase("h1") || name.equalsIgnoreCase("h2")
        || name.equalsIgnoreCase("h3") || name.equalsIgnoreCase("h4")
        || name.equalsIgnoreCase("h5")) {
        hTagCount++;
      }
      else if ((name.equalsIgnoreCase("div") || name.equalsIgnoreCase("p"))
        && linkCount >= 3 && hTagCount >= 3
        && isContainerSetup(linkCount, hTagCount)) {
        // heuristic for things that look like lists of linked headings
        linkContainer = true;
        hTagCount = 0;
      }

      if (containerElements.contains(name)) {

        // containers with no words are removed, otherwise compute the
        // word-to-link ratio for this subtree
        if (wordCount == 0) {
          delete = true;
        }
        else {
          float linkDenom = linkCount > 0 ? linkCount : 1;
          ratio = (float)wordCount / linkDenom;
          totalRatio += ratio;
        }

        // link-heavy containers are removed
        if (linkCount > minLinks && ratio < threshold && totalRatio < threshold) {
          delete = true;
        }

        // containers identified as link lists are removed
        if (linkContainer) {
          delete = true;
        }

        // containers with too little text are removed; spans are excluded
        // because they are often small inline fragments of larger blocks
        if (!name.equalsIgnoreCase("span")) {
          if (textLength < minTextLength || wordCount < minWords) {
            delete = true;
          }
        }
      }
    }
    else if (node instanceof Text) {
      // accumulate text length and word count for text nodes
      Text text = (Text)node;
      String normalized = text.getTextNormalize();
      if (StringUtils.isNotBlank(normalized)) {
        textLength += normalized.length();
        wordCount += StringUtils.split(normalized).length;
      }
    }

    Map<String, Object> output = new HashMap<String, Object>();
    output.put("textLength", textLength);
    output.put("wordCount", wordCount);
    output.put("linkCount", linkCount);
    output.put("nodeCount", nodeCount);
    output.put("hTagCount", hTagCount);
    output.put("liCount", liCount);
    output.put("ratio", ratio);
    output.put("delete", delete);
    return output;
  }

  @Override
  public boolean processContent(Document doc) {

    Set<Content> remove = new LinkedHashSet<Content>();
    Element rootElem = doc.getRootElement();

    // remove specific non-content elements and comments
    List<Content> contents = rootElem.getContent();
    for (Content child : contents) {
      cleanNodes(0, child, remove);
    }
    for (Content content : remove) {
      content.getParent().removeContent(content);
    }

    // clear the remove set
    remove.clear();

    // remove link container elements
    List<Content> containerContents = rootElem.getContent();
    for (Content child : containerContents) {
      cleanLinkContainers(0, child, remove);
    }
    for (Content content : remove) {
      content.getParent().removeContent(content);
    }

    // add the content html, only when the extracted html is non-blank
    if (extractHtml) {
      String contentHtml = XMLUtils.toHtml(doc);
      if (StringUtils.isNotBlank(contentHtml)) {
        addExtractedValue("html", contentHtml);
      }
    }

    // add the content text, only when the extracted text is non-blank
    if (extractText) {
      String contentText = XMLUtils.toText(doc);
      if (StringUtils.isNotBlank(contentText)) {
        addExtractedValue("text", contentText);
      }
    }

    return true;
  }

  public double getThreshold() {
    return threshold;
  }

  public void setThreshold(double threshold) {
    this.threshold = threshold;
  }

  public int getMinTextLength() {
    return minTextLength;
  }

  public void setMinTextLength(int minTextLength) {
    this.minTextLength = minTextLength;
  }

  public int getMinWords() {
    return minWords;
  }

  public void setMinWords(int minWords) {
    this.minWords = minWords;
  }

  public int getMinLinks() {
    return minLinks;
  }

  public void setMinLinks(int minLinks) {
    this.minLinks = minLinks;
  }

  public int getMaxRecurseDepth() {
    return maxRecurseDepth;
  }

  public void setMaxRecurseDepth(int maxRecurseDepth) {
    this.maxRecurseDepth = maxRecurseDepth;
  }

  public double getLinkListThreshold() {
    return linkListThreshold;
  }

  public void setLinkListThreshold(double linkListThreshold) {
    this.linkListThreshold = linkListThreshold;
  }

  public boolean isExtractHtml() {
    return extractHtml;
  }

  public void setExtractHtml(boolean extractHtml) {
    this.extractHtml = extractHtml;
  }

  public boolean isExtractText() {
    return extractText;
  }

  public void setExtractText(boolean extractText) {
    this.extractText = extractText;
  }
}
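
Below is a minimal, hypothetical usage sketch. It assumes the input markup is well-formed XHTML that JDOM's SAXBuilder can parse; the example class name and the sample markup are illustrative only, and reading back the extracted "html"/"text" values happens through the HtmlContentProcessor base class, whose accessor is not shown in this file.

// MainContentProcessorExample.java -- hypothetical usage sketch, not part of the library
import java.io.StringReader;

import org.jdom.Document;
import org.jdom.input.SAXBuilder;

import org.meaningfulweb.cext.processors.MainContentProcessor;

public class MainContentProcessorExample {

  public static void main(String[] args) throws Exception {

    // sample well-formed XHTML: one content block and one navigation-style link list
    String xhtml = "<html><body>"
      + "<div><p>Article body text with enough words to be kept as main content.</p></div>"
      + "<ul><li><a href=\"/a\">nav</a></li><li><a href=\"/b\">nav</a></li></ul>"
      + "</body></html>";

    // parse the markup into a JDOM document
    Document doc = new SAXBuilder().build(new StringReader(xhtml));

    MainContentProcessor processor = new MainContentProcessor();
    processor.setMinTextLength(10); // loosen the text-length threshold for this tiny sample
    processor.setExtractText(true);

    // prunes non-content elements and link containers in place, then stores the
    // surviving html/text via addExtractedValue(...) on the base class
    processor.processContent(doc);
  }
}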