package org.meaningfulweb.cext.processors; import org.meaningfulweb.cext.HtmlContentProcessor; import org.meaningfulweb.util.XMLUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.jdom.Document; public class FullContentProcessor extends HtmlContentProcessor { public static final Log LOG = LogFactory.getLog(FullContentProcessor.class); private boolean extractHtml = true; private boolean extractText = true; public FullContentProcessor() { } public boolean isExtractHtml() { return extractHtml; } public void setExtractHtml(boolean extractHtml) { this.extractHtml = extractHtml; } public boolean isExtractText() { return extractText; } public void setExtractText(boolean extractText) { this.extractText = extractText; } @Override public boolean processContent(Document document) { // add the full html of the document if (extractHtml) { String fullHtml = XMLUtils.toHtml(document); addExtractedValue("html", fullHtml); } // add the full text of the document if (extractText) { String fullText = XMLUtils.toText(document); addExtractedValue("text", fullText); } return true; } }