package org.meaningfulweb.cext.processors; import java.io.StringReader; import org.jdom.Document; import org.jdom.output.XMLOutputter; import org.meaningfulweb.cext.HtmlContentProcessor; import org.xml.sax.InputSource; import de.l3s.boilerpipe.document.TextDocument; import de.l3s.boilerpipe.extractors.ArticleExtractor; import de.l3s.boilerpipe.extractors.ArticleSentencesExtractor; import de.l3s.boilerpipe.extractors.ExtractorBase; import de.l3s.boilerpipe.sax.BoilerpipeSAXInput; public class BoilerpipeArticleProcessor extends HtmlContentProcessor { private boolean tuneForSentences = true; private ExtractorBase extractor = new ArticleSentencesExtractor(); public boolean isTuneForSentences() { return tuneForSentences; } public void setTuneForSentences(boolean tuneForSentences) { this.tuneForSentences = tuneForSentences; if (tuneForSentences){ extractor = ArticleSentencesExtractor.INSTANCE; } else{ extractor = ArticleExtractor.INSTANCE; } } @Override public boolean processContent(Document document) { try{ XMLOutputter outputter = new XMLOutputter(); String xml = outputter.outputString(document.getRootElement()); BoilerpipeSAXInput saxinput = new BoilerpipeSAXInput(new InputSource(new StringReader(xml))); TextDocument textDoc = saxinput.getTextDocument(); String text = extractor.getText(textDoc); addExtractedValue("text", text); return true; } catch(Exception e){ return false; } } }