package org.meaningfulweb.cext.processors; import java.util.Collection; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.meaningfulweb.cext.HtmlContentProcessor; import org.meaningfulweb.util.JDomUtils; import org.meaningfulweb.util.XMLUtils; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.jdom.Comment; import org.jdom.Content; import org.jdom.Document; import org.jdom.Element; public class ElementProcessor extends HtmlContentProcessor { public static final Log LOG = LogFactory.getLog(ElementProcessor.class); private Set<String> elements = new LinkedHashSet<String>(); private Set<String> headers = new LinkedHashSet<String>(); private boolean extractHtml = true; private boolean extractText = true; private int maxRecurseDepth = 250; private void extractFromNodes(int level, Content node) { // don't go on forever, spider traps can kill JVM through stack overflow if (node == null || level == maxRecurseDepth) { return; } if (node instanceof Element) { Element elem = (Element)node; String name = StringUtils.lowerCase(elem.getName()); // extract out elements by name if (elements.contains(name)) { if (extractHtml) { addExtractedValue(name + ".html", XMLUtils.toHtml(elem)); } if (extractText) { addExtractedValue(name, XMLUtils.toText(elem)); } } else if (name.equalsIgnoreCase("meta")) { // parse and extract out meta tag values (headers) by name String metaIdent = JDomUtils.getAttributeValue(elem, "name"); if (metaIdent == null) { metaIdent = JDomUtils.getAttributeValue(elem, "http-equiv"); } if (metaIdent == null) { metaIdent = JDomUtils.getAttributeValue(elem, "property"); } if (metaIdent != null) { metaIdent = StringUtils.trim(StringUtils.lowerCase(metaIdent)); Map<String,Object> extractedMap = this.getExtracted(); // image already extracted if (extractedMap.containsKey("image")){ return; } if (headers.contains(metaIdent)) { String metaValue = JDomUtils.getAttributeValue(elem, "content"); if (StringUtils.isNotBlank(metaValue)) { addExtractedValue(metaIdent, metaValue); } } } } else if (name.equalsIgnoreCase("link")) { // parse and extract link rel values Map<String, String> linkAttrs = new LinkedHashMap<String, String>(); String rel = JDomUtils.getAttributeValue(elem, "rel"); if (StringUtils.isNotBlank(rel)) { linkAttrs.put("rel", rel); } String type = JDomUtils.getAttributeValue(elem, "type"); boolean hasType = StringUtils.isNotBlank(type); if (hasType) { linkAttrs.put("type", type); } String href = JDomUtils.getAttributeValue(elem, "href"); if (StringUtils.isNotBlank(href)) { linkAttrs.put("href", href); } if (headers.contains("link") || headers.contains("link:" + rel) || (hasType && headers.contains("link:" + rel + ":" + type))) { addExtractedValue("link", linkAttrs); } } List<Content> children = elem.getContent(); if (children != null && children.size() > 0) { for (Content child : children) { extractFromNodes(++level, child); } } } else if (node instanceof Comment) { // possibly extract out comments if (elements.contains("comment")) { addExtractedValue("comment", ((Comment)node).getText()); } } } public Collection<String> getElements() { return elements; } public void setElements(Collection<String> elements) { if (elements != null) { if (elements instanceof Set) { this.elements = (Set<String>)elements; } else { Set<String> newElements = new LinkedHashSet<String>(); newElements.addAll(elements); this.elements = newElements; } } } public Collection<String> getHeaders() { return headers; } public void setHeaders(Collection<String> headers) { if (headers != null) { if (headers instanceof Set) { this.headers = (Set<String>)headers; } else { Set<String> newHeaders = new LinkedHashSet<String>(); newHeaders.addAll(headers); this.headers = newHeaders; } } } public boolean isExtractHtml() { return extractHtml; } public void setExtractHtml(boolean extractHtml) { this.extractHtml = extractHtml; } public boolean isExtractText() { return extractText; } public void setExtractText(boolean extractText) { this.extractText = extractText; } public int getMaxRecurseDepth() { return maxRecurseDepth; } public void setMaxRecurseDepth(int maxRecurseDepth) { this.maxRecurseDepth = maxRecurseDepth; } @Override public boolean processContent(Document document) { boolean hasElements = (elements != null && elements.size() > 0); boolean hasHeaders = (headers != null && headers.size() > 0); if (hasElements || hasHeaders) { Element rootElem = document.getRootElement(); List<Content> contents = rootElem.getContent(); for (Content child : contents) { extractFromNodes(0, child); } } return true; } }