package org.meaningfulweb.cext;

import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.BooleanUtils;
import org.apache.tika.parser.txt.CharsetDetector;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.JDomSerializer;
import org.htmlcleaner.TagNode;
import org.jdom.Document;
import org.meaningfulweb.util.EncodingUtils;
import org.meaningfulweb.util.XMLUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Converts raw HTML bytes into a cleaned JDOM document and runs the
 * configured {@link HtmlContentProcessor} components and
 * {@link HtmlContentPipeline} pipelines over it, accumulating their output
 * into the {@link Extract}'s extracted-values map.
 */
public class HtmlExtractor {

  private static final Logger LOG = LoggerFactory.getLogger(HtmlExtractor.class);

  // runtime-config keys: when set to Boolean.TRUE, each component/pipeline
  // receives its own deep copy of the DOM instead of sharing one mutable tree
  public static final String perComponentDOM = "perComponentDOM";
  public static final String perPipelineDOM = "perPipelineDOM";

  private HtmlContentProcessorFactory processorFactory;

  /**
   * Detects the charset of the extract's raw content, cleans the HTML into
   * a JDOM document, and runs every configured component and pipeline over
   * it, storing their results in {@code extract.getExtracted()}.
   *
   * <p>If the cleaned HTML cannot be serialized to JDOM the error is logged
   * and the method returns without producing any extracted values.
   *
   * @param extract carries the raw bytes, runtime config, metadata, and the
   *        ordered component/pipeline names; results are written back into
   *        its extracted map
   * @throws Exception if decoding, a component, or a pipeline fails
   */
  public void extract(Extract extract) throws Exception {
    Document doc = buildCleanDom(extract.getContent());
    if (doc == null) {
      // serialization failed; already logged, nothing to process
      return;
    }

    Map<String, Object> runtime = extract.getConfig();
    Map<String, Object> metadata = extract.getMetadata();
    Map<String, Object> extracted = extract.getExtracted();

    runComponents(extract.getComponents(), doc, runtime, metadata, extracted);
    runPipelines(extract.getPipelines(), doc, runtime, metadata, extracted);
  }

  /**
   * Detects the charset, strips characters invalid in XML, runs HtmlCleaner,
   * and serializes the result to JDOM.
   *
   * @return the cleaned document, or {@code null} if JDOM serialization failed
   *         (the failure is logged)
   */
  private Document buildCleanDom(byte[] content) throws Exception {
    CharsetDetector charDetect = new CharsetDetector();
    charDetect.setText(content);
    String charSet = charDetect.detect().getName();

    // decode with the detected charset, then drop characters XML forbids
    String unclean = EncodingUtils.getEncodedString(content, charSet);
    String contentStr = XMLUtils.stripNonValidXMLCharacters(unclean);

    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties props = cleaner.getProperties();
    props.setUseCdataForScriptAndStyle(false);
    props.setOmitComments(true);
    props.setOmitUnknownTags(true);
    props.setOmitDoctypeDeclaration(true);
    props.setOmitXmlDeclaration(true);
    props.setRecognizeUnicodeChars(false);
    props.setAdvancedXmlEscape(true);
    props.setTranslateSpecialEntities(false);
    props.setNamespacesAware(false);
    props.setAllowHtmlInsideAttributes(true);
    props.setAllowMultiWordAttributes(true);

    TagNode nodes = cleaner.clean(contentStr);
    ExtractUtils.cleanInvalidAttributes(nodes);
    try {
      return new JDomSerializer(props, true).createJDom(nodes);
    } catch (Exception e) {
      LOG.error("Error extracting content.", e);
      return null;
    }
  }

  /**
   * Returns the shared document, or a deep copy of it when {@code copy} is
   * true, so a component/pipeline can mutate the DOM without affecting others.
   */
  private static Document domFor(Document doc, boolean copy) {
    if (!copy) {
      return doc;
    }
    Document copyDoc = new Document();
    copyDoc.addContent(doc.cloneContent());
    return copyDoc;
  }

  /**
   * Runs each named component over the DOM; each extracted key is namespaced
   * as {@code processorName.key} in the shared extracted map.
   */
  private void runComponents(Set<String> compNames, Document doc,
      Map<String, Object> runtime, Map<String, Object> metadata,
      Map<String, Object> extracted) throws Exception {
    if (compNames == null || compNames.isEmpty()) {
      return;
    }
    for (String compName : compNames) {
      // read the flag per iteration: the factory receives the runtime map and
      // could conceivably alter it
      boolean isPerComponentDOM = BooleanUtils.toBoolean((Boolean) runtime
          .get(perComponentDOM));
      HtmlContentProcessor processor = processorFactory.getComponent(compName,
          runtime);
      if (processor == null) {
        // mirror the pipeline null-guard below: skip unknown component names
        // instead of throwing a NullPointerException
        LOG.warn("No processor component found for name: {}", compName);
        continue;
      }
      processor.setMetadata(metadata);
      processor.processContent(domFor(doc, isPerComponentDOM));
      for (Map.Entry<String, Object> entry : processor.getExtracted().entrySet()) {
        String fullname = processor.getName() + "." + entry.getKey();
        extracted.put(fullname, entry.getValue());
      }
    }
  }

  /**
   * Runs each named pipeline over the DOM and merges its output into the
   * shared extracted map; unknown pipeline names are skipped.
   */
  private void runPipelines(Set<String> plNames, Document doc,
      Map<String, Object> runtime, Map<String, Object> metadata,
      Map<String, Object> extracted) throws Exception {
    if (plNames == null || plNames.isEmpty()) {
      return;
    }
    for (String name : plNames) {
      boolean isPerPipelineDOM = BooleanUtils.toBoolean((Boolean) runtime
          .get(perPipelineDOM));
      HtmlContentPipeline pipeline = processorFactory.getPipeline(name, runtime);
      if (pipeline == null) {
        continue;
      }
      pipeline.setMetadata(metadata);
      Map<String, Object> plOutput = pipeline.processPipeline(domFor(doc,
          isPerPipelineDOM));
      extracted.putAll(plOutput);
    }
  }

  /** Injects the factory used to resolve component and pipeline names. */
  public void setProcessorFactory(HtmlContentProcessorFactory processorFactory) {
    this.processorFactory = processorFactory;
  }
}