package org.meaningfulweb.cext.processors; import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.List; import java.util.Set; import org.meaningfulweb.cext.HtmlContentProcessor; import org.meaningfulweb.util.XMLUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.jdom.Comment; import org.jdom.Document; import org.jdom.Element; import org.jdom.JDOMException; import org.jdom.Text; import org.jdom.xpath.XPath; public class XPathProcessor extends HtmlContentProcessor { public static final Log LOG = LogFactory.getLog(XPathProcessor.class); private Set<String> xpaths = new LinkedHashSet<String>(); private boolean cleaning = false; private boolean extractHtml = true; private boolean extractText = true; @Override public boolean processContent(Document document) { if (xpaths != null && xpaths.size() > 0) { List<Element> selectedList = new ArrayList<Element>(); for (String xpath : xpaths) { List selNodes = null; try { XPath xp = XPath.newInstance(xpath); selNodes = xp.selectNodes(document); } catch (JDOMException e) { e.printStackTrace(); } if (selNodes != null && selNodes.size() > 0) { for (Object objNode : selNodes) { if (objNode instanceof Element) { Element selElem = (Element)objNode; if (extractHtml) { addExtractedValue(xpath, XMLUtils.toHtml(selElem)); } if (extractText) { addExtractedValue(xpath + ".text", XMLUtils.toText(selElem)); } if (cleaning) { selectedList.add(selElem); } } else if (objNode instanceof Text) { addExtractedValue(xpath, ((Text)objNode).getTextNormalize()); } else if (objNode instanceof Comment) { addExtractedValue(xpath, ((Comment)objNode).getText()); } } } } if (cleaning && selectedList.size() > 0) { Element root = document.getRootElement(); for (Element connected : selectedList) { connected.detach(); } root.setContent(selectedList); } } return true; } public Collection<String> getXpaths() { return xpaths; } public void setXpaths(Collection<String> xpaths) { if (xpaths != null) { if (xpaths instanceof Set) { this.xpaths = (Set<String>)xpaths; } else { Set<String> newXpaths = new HashSet<String>(); newXpaths.addAll(xpaths); this.xpaths = newXpaths; } } } public boolean isExtractHtml() { return extractHtml; } public void setExtractHtml(boolean extractHtml) { this.extractHtml = extractHtml; } public boolean isExtractText() { return extractText; } public void setExtractText(boolean extractText) { this.extractText = extractText; } public boolean isCleaning() { return cleaning; } public void setCleaning(boolean cleaning) { this.cleaning = cleaning; } }