/**
* <a href="http://www.openolat.org">
* OpenOLAT - Online Learning and Training</a><br>
* <p>
* Licensed under the Apache License, Version 2.0 (the "License"); <br>
* you may not use this file except in compliance with the License.<br>
* You may obtain a copy of the License at the
* <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache homepage</a>
* <p>
* Unless required by applicable law or agreed to in writing,<br>
* software distributed under the License is distributed on an "AS IS" BASIS, <br>
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br>
* See the License for the specific language governing permissions and <br>
* limitations under the License.
* <p>
* Initial code contributed and copyrighted by<br>
* frentix GmbH, http://www.frentix.com
* <p>
*/
package org.olat.core.util.filter.impl;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.cyberneko.html.parsers.SAXParser;
import org.olat.core.logging.OLog;
import org.olat.core.logging.Tracing;
import org.olat.core.util.filter.Filter;
import org.olat.core.util.io.LimitedContentWriter;
import org.olat.search.service.document.file.FileDocumentFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* Description:<br>
* Filter the HTML code using Neko SAX parser and extract the content.
* Neko parse the HTML entities too and deliver cleaned text.
*
* <P>
* Initial Date: 2 dec. 2009 <br>
* @author srosse
*/
public class NekoHTMLFilter implements Filter {
private static final OLog log = Tracing.createLoggerFor(NekoHTMLFilter.class);
public static final Set<String> blockTags = new HashSet<String>();
static {
blockTags.addAll(Arrays.asList("address","blockquote","br","dir","div","dl","fieldset","form","h1","h2","h3","h4","h5","h6","hr","noframes","noscript","ol","p","pre","table","ul","li"));
}
@Override
public String filter(String original) {
return filter(original, false);
}
public String filter(String original, boolean pretty) {
if(original == null) return null;
if(original.isEmpty()) return "";
try {
SAXParser parser = new SAXParser();
HTMLHandler contentHandler = new HTMLHandler((int)(original.length() * 0.66f), pretty);
parser.setContentHandler(contentHandler);
parser.parse(new InputSource(new StringReader(original)));
return contentHandler.toString();
} catch (SAXException e) {
log.error("", e);
return null;
} catch (IOException e) {
log.error("", e);
return null;
} catch (Exception e) {
log.error("", e);
return null;
}
}
public NekoContent filter(InputStream in) {
if (in == null) return null;
try {
SAXParser parser = new SAXParser();
HTMLHandler contentHandler = new HTMLHandler((int)(1000 * 0.66f), false);
parser.setContentHandler(contentHandler);
parser.parse(new InputSource(in));
return contentHandler.getContent();
} catch (SAXException e) {
log.error("", e);
return null;
} catch (IOException e) {
log.error("", e);
return null;
} catch (Exception e) {
log.error("", e);
return null;
}
}
public static class NekoContent {
private final String title;
private final LimitedContentWriter content;
public NekoContent(String title, LimitedContentWriter content) {
this.title = title;
this.content = content;
}
public String getTitle() {
return title;
}
public String getContent() {
return content.toString();
}
}
private static class HTMLHandler extends DefaultHandler {
private boolean collect = true;
private boolean consumeBlanck = false;
private boolean consumeTitle = true;
private final boolean pretty;
private final LimitedContentWriter content;
private final StringBuilder title;
public HTMLHandler(int size, boolean pretty) {
this.pretty = pretty;
content = new LimitedContentWriter(size, FileDocumentFactory.getMaxFileSize());
title = new StringBuilder(32);
}
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) {
String elem = localName.toLowerCase();
if("script".equals(elem)) {
collect = false;
// add a single whitespace before each block element but only if not there is not already a whitespace there
} else {
if(pretty) {
if("li".equals(elem)) {
content.append("\u00B7 ");
} else if("br".equals(elem)) {
content.append('\n');
}
}
if("title".equals(elem)) {
consumeTitle = true;
}
if(blockTags.contains(elem) && content.length() > 0 && content.charAt(content.length() -1) != ' ' ) {
consumeBlanck = true;
}
}
}
@Override
public void characters(char[] chars, int offset, int length) {
if(collect) {
if(consumeBlanck) {
if(content.length() > 0 && content.charAt(content.length() -1) != ' ' && length > 0 && chars[offset] != ' ') {
content.append(' ');
}
consumeBlanck = false;
}
content.write(chars, offset, length);
if(consumeTitle) {
title.append(chars, offset, length);
}
}
}
@Override
public void endElement(String uri, String localName, String qName) {
String elem = localName.toLowerCase();
if("script".equals(elem)) {
collect = true;
} else {
if(pretty && ("li".equals(elem) || "p".equals(elem))) {
content.append('\n');
}
if("title".equals(elem)) {
consumeTitle = false;
}
if(blockTags.contains(elem) && content.length() > 0 && content.charAt(content.length() -1) != ' ' ) {
consumeBlanck = true;
}
}
}
public NekoContent getContent() {
return new NekoContent(title.toString(), content);
}
@Override
public String toString() {
return content.toString();
}
}
}