package com.enonic.cms.plugin.extractor; import java.io.IOException; import java.io.InputStream; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.TextNode; import com.enonic.cms.api.plugin.ext.TextExtractor; public class HtmlExtractor extends TextExtractor { @Override public boolean canHandle( final String mimeType ) { return "text/html".equals( mimeType ); } @Override public String extractText( final String mimeType, final InputStream inputStream, final String encoding ) throws IOException { if ( !canHandle( mimeType ) ) { return null; } StringBuilder builder = new StringBuilder(); Document doc = Jsoup.parse( inputStream, encoding, "" ); for ( Element element : doc.getAllElements() ) { for ( TextNode textNode : element.textNodes() ) { final String text = textNode.text(); builder.append( text ); appendWhitespaceAfterTextIfNotThere( builder, text ); } } return builder.toString(); } private void appendWhitespaceAfterTextIfNotThere( final StringBuilder builder, final String text ) { if ( text != null && !text.isEmpty() && !text.endsWith( " " ) ) { builder.append( " " ); } } }