/*
* Copyright 2008-2011 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.nominanuda.html;
import java.io.CharArrayWriter;
import java.io.Reader;
import java.io.Writer;
import javax.xml.transform.sax.SAXResult;
import javax.xml.transform.sax.SAXSource;
import javax.xml.transform.sax.TransformerHandler;
import org.xml.sax.InputSource;
import com.nominanuda.web.html.HtmlFragmentParser;
import com.nominanuda.web.html.XHtml5Serializer;
import com.nominanuda.zen.common.InstanceFactory;
import com.nominanuda.zen.xml.HtmlPurifyTransformer;
import com.nominanuda.zen.xml.SAXPipeline;
import com.nominanuda.zen.xml.TextSelectTransformer;
import com.nominanuda.zen.xml.WhiteSpaceNormalizingTransformer;
import nu.validator.htmlparser.sax.HtmlParser;
/**
 * Convenience entry points that push HTML markup through a {@link SAXPipeline}
 * and serialize the outcome with an {@link XHtml5Serializer}.
 *
 * <p>Two ready-made pipelines are offered ({@code cleanHtml} and
 * {@code htmlToText}); arbitrary stage lists can be run through
 * {@link #runHtmlPipeline(Reader, Writer, TransformerHandler...)}.
 */
public class HtmlSanitizer {

    /**
     * Runs {@link #cleanHtml(Reader, Writer)} against an in-memory buffer.
     *
     * @param r character source holding the markup to process
     * @return everything the pipeline wrote, as a string
     */
    public String cleanHtml(Reader r) {
        CharArrayWriter buffer = new CharArrayWriter();
        cleanHtml(r, buffer);
        return buffer.toString();
    }

    /**
     * Runs {@link #htmlToText(Reader, Writer)} against an in-memory buffer.
     *
     * @param r character source holding the markup to process
     * @return everything the pipeline wrote, as a string
     */
    public String htmlToText(Reader r) {
        CharArrayWriter buffer = new CharArrayWriter();
        htmlToText(r, buffer);
        return buffer.toString();
    }

    /**
     * Processes the markup with a {@link WhiteSpaceNormalizingTransformer}
     * followed by a {@link TextSelectTransformer}.
     *
     * @param source character source holding the markup to process
     * @param sink   destination receiving the serialized result
     */
    public void htmlToText(Reader source, Writer sink) {
        TransformerHandler[] stages = {
            new WhiteSpaceNormalizingTransformer(),
            new TextSelectTransformer()
        };
        runHtmlPipeline(source, sink, stages);
    }

    /**
     * Processes the markup with a {@link WhiteSpaceNormalizingTransformer}
     * followed by an {@link HtmlPurifyTransformer}.
     *
     * @param source character source holding the markup to process
     * @param sink   destination receiving the serialized result
     */
    public void cleanHtml(Reader source, Writer sink) {
        TransformerHandler[] stages = {
            new WhiteSpaceNormalizingTransformer(),
            new HtmlPurifyTransformer()
        };
        runHtmlPipeline(source, sink, stages);
    }

    /**
     * Parses {@code source} with an {@link HtmlParser} wrapped in an
     * {@link HtmlFragmentParser}, feeds the SAX events through the given
     * handlers in the order supplied, and serializes the result to
     * {@code sink} through an {@link XHtml5Serializer}.
     *
     * <p>The parser is configured to map {@code lang} to {@code xml:lang}
     * and not to report the doctype.
     *
     * @param source character source holding the markup to process
     * @param sink   destination receiving the serialized result
     * @param tx     pipeline stages, applied in array order
     */
    public void runHtmlPipeline(Reader source, Writer sink, TransformerHandler... tx) {
        HtmlParser htmlParser = new HtmlParser();
        htmlParser.setMappingLangToXmlLang(true);
        htmlParser.setReportingDoctype(false);

        HtmlFragmentParser fragmentReader = new HtmlFragmentParser(htmlParser);
        SAXSource input = new SAXSource(fragmentReader, new InputSource(source));
        SAXResult output = new SAXResult(new XHtml5Serializer(sink));

        SAXPipeline pipeline = new SAXPipeline();
        for (int i = 0; i < tx.length; i++) {
            // Each handler instance is wrapped in a factory, as SAXPipeline.add expects.
            pipeline.add(new InstanceFactory<TransformerHandler>(tx[i]));
        }

        pipeline.complete().build(input, output).run();
    }
}