package org.meaningfulweb.cext.processors;

import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import org.meaningfulweb.cext.HtmlContentProcessor;
import org.meaningfulweb.util.JDomUtils;
import org.meaningfulweb.util.URLUtil;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jdom.Content;
import org.jdom.Document;
import org.jdom.Element;

/**
 * Content processor that walks a JDOM document looking for anchor
 * ({@code <a>}) elements and records the {@code href} of every link whose
 * extension is one of the configured {@link #getExtensions() extensions}.
 *
 * <p>Matching links are handed to {@code addExtractedValue(extension, href)},
 * which is not defined in this class (presumably inherited from
 * {@link HtmlContentProcessor} -- confirm against that class).</p>
 */
public class HyperlinkProcessor
  extends HtmlContentProcessor {

  public static final Log LOG = LogFactory.getLog(HyperlinkProcessor.class);

  /** Extensions to look for; a link is recorded only if its extension is in this set. */
  private Set<String> extensions = new LinkedHashSet<String>();

  /** Maximum nesting depth, relative to the children of the root element, that is visited. */
  private int maxRecurseDepth = 250;

  /**
   * Recursively visits {@code node} and its descendants, recording the href
   * of each anchor element whose extension is configured.
   *
   * @param level Depth of {@code node}; the children of the document root are
   * at level 0 and each nesting step adds one.
   * @param node The node to inspect. Nodes that are not elements are ignored.
   */
  private void extractLinkExtensions(int level, Content node) {

    // Bound the recursion depth: pathologically nested markup could otherwise
    // overflow the stack. ">=" (rather than "==") keeps the guard effective
    // even if the limit is configured to zero or a negative value.
    if (node == null || level >= maxRecurseDepth) {
      return;
    }

    if (!(node instanceof Element)) {
      return;
    }

    Element elem = (Element)node;
    if (StringUtils.equalsIgnoreCase(elem.getName(), "a")) {
      String href = JDomUtils.getAttributeValue(elem, "href");
      if (StringUtils.isNotBlank(href)) {
        // NOTE(review): URLUtil is not visible here; presumably getPage
        // isolates the page portion of the URL and getExtension its file
        // extension -- confirm, including how case is handled.
        String page = URLUtil.getPage(href);
        String extension = URLUtil.getExtension(page);
        if (extensions.contains(extension)) {
          addExtractedValue(extension, href);
        }
      }
    }

    List<Content> children = elem.getContent();
    if (children != null) {
      // Every child is exactly one level deeper than its parent. The level
      // must not be incremented per sibling, otherwise the counter measures
      // the number of siblings visited instead of the nesting depth.
      int childLevel = level + 1;
      for (Content child : children) {
        extractLinkExtensions(childLevel, child);
      }
    }
  }

  /**
   * Returns the extensions this processor looks for.
   *
   * @return The live internal collection of extensions, not a copy.
   */
  public Collection<String> getExtensions() {
    return extensions;
  }

  /**
   * Sets the extensions this processor looks for. A {@code null} argument is
   * ignored and leaves the current extensions untouched.
   *
   * @param extensions The extensions to match. When this is already a
   * {@link Set} it is stored as is (no copy is made); any other collection is
   * copied into a new {@link LinkedHashSet}.
   */
  public void setExtensions(Collection<String> extensions) {
    if (extensions == null) {
      return;
    }
    if (extensions instanceof Set) {
      this.extensions = (Set<String>)extensions;
    }
    else {
      this.extensions = new LinkedHashSet<String>(extensions);
    }
  }

  /**
   * Returns the maximum nesting depth that is visited.
   *
   * @return The maximum recursion depth.
   */
  public int getMaxRecurseDepth() {
    return maxRecurseDepth;
  }

  /**
   * Sets the maximum nesting depth that is visited. Nodes at or below this
   * depth (counting the children of the root element as depth 0) are skipped.
   *
   * @param maxRecurseDepth The maximum recursion depth.
   */
  public void setMaxRecurseDepth(int maxRecurseDepth) {
    this.maxRecurseDepth = maxRecurseDepth;
  }

  /**
   * Scans every child of the document's root element for matching links.
   *
   * @param document The document to scan.
   *
   * @return Always {@code true}.
   */
  @Override
  public boolean processContent(Document document) {

    Element rootElem = document.getRootElement();
    List<Content> contents = rootElem.getContent();
    for (Content child : contents) {
      extractLinkExtensions(0, child);
    }

    return true;
  }
}