/**
* Copyright (C) 2010 Orbeon, Inc.
*
* This program is free software; you can redistribute it and/or modify it under the terms of the
* GNU Lesser General Public License as published by the Free Software Foundation; either version
* 2.1 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Lesser General Public License for more details.
*
* The full text of the license is available at http://www.gnu.org/copyleft/lesser.html
*/
package org.orbeon.oxf.processor;
import org.apache.log4j.Logger;
import org.orbeon.oxf.common.OXFException;
import org.orbeon.oxf.common.ValidationException;
import org.orbeon.oxf.xml.*;
import org.orbeon.oxf.processor.generator.TidyConfig;
import org.orbeon.oxf.processor.serializer.CachedSerializer;
import org.orbeon.oxf.util.LoggerFactory;
import org.orbeon.oxf.xml.XMLParsing;
import org.orbeon.oxf.xml.dom4j.LocationData;
import org.w3c.dom.Document;
import org.w3c.tidy.Tidy;
import org.xml.sax.InputSource;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import javax.xml.transform.dom.DOMSource;
import java.io.*;
/**
* Intercept either an OutputStream or a Writer.
*
* This implementation holds a buffer for either a Writer or an Output Stream. The buffer can then
* be parsed.
*/
public class StreamInterceptor {
private static Logger logger = LoggerFactory.createLogger(StreamInterceptor.class);
private StringWriter writer;
private ByteArrayOutputStream byteStream;
private String encoding = CachedSerializer.DEFAULT_ENCODING;
private String contentType = ProcessorUtils.DEFAULT_CONTENT_TYPE;
public Writer getWriter() {
if (byteStream != null)
throw new IllegalStateException("getWriter is called after getOutputStream was already called.");
if (writer == null)
writer = new StringWriter();
return writer;
}
public OutputStream getOutputStream() {
if (writer != null)
throw new IllegalStateException("getOutputStream is called after getWriter was already called.");
if (byteStream == null)
byteStream = new ByteArrayOutputStream();
return byteStream;
}
public void setEncoding(String encoding) {
this.encoding = encoding;
}
public void setContentType(String contentType) {
this.contentType = contentType;
}
public void parse(XMLReceiver xmlReceiver, TidyConfig tidyConfig, boolean fragment) {
try {
// Create InputSource
InputSource inputSource = null;
String stringContent = null;
if (writer != null) {
stringContent = writer.toString();
if (stringContent.length() > 0) {
if (logger.isDebugEnabled()) {
logger.debug("Document to parse in filter: ");
logger.debug(stringContent);
}
inputSource = new InputSource(new StringReader(stringContent));
}
} else if (byteStream != null) {
byte[] byteContent = byteStream.toByteArray();
if (byteContent.length > 0) {
if (logger.isDebugEnabled()) {
logger.debug("Document to parse in filter: ");
logger.debug(new String(byteContent, encoding));
}
inputSource = new InputSource(new ByteArrayInputStream(byteContent));
if (encoding != null)
inputSource.setEncoding(encoding);
}
} else {
throw new OXFException("Filtered resource did not call getWriter() or getOutputStream().");
}
// Parse the output only if text was generated
if (inputSource != null) {
if (ProcessorUtils.HTML_CONTENT_TYPE.equals(contentType)) {
// The document contains HTML. Parse it using Tidy.
final Tidy tidy = new Tidy();
if (tidyConfig != null) {
tidy.setShowWarnings(tidyConfig.isShowWarnings());
tidy.setQuiet(tidyConfig.isQuiet());
if (tidyConfig.isQuiet())
tidy.setErrout(new PrintWriter(new StringWriter()));
}
final InputStream inputStream;
if (writer == null) {
// Unfortunately, it doesn't look like tidy support
// detecting the encoding from the HTML document, so we
// are left to using a default or hope that the source
// set a known encoding.
inputStream = inputSource.getByteStream();
tidy.setInputEncoding(TidyConfig.getTidyEncoding(encoding));
} else {
// Here we go from characters to bytes to characters
// again, which is very suboptimal, but the version of
// tidy used does not support a Reader as input.
// Use utf-8 both ways and hope for the best
inputStream = new ByteArrayInputStream(stringContent.getBytes("utf-8"));
tidy.setInputEncoding("utf-8");
}
final Document document = tidy.parseDOM(inputStream, null);
// Output the result
if (fragment) {
// Do not generate start and end document events
TransformerUtils.sourceToSAX(new DOMSource(document), new EmbeddedDocumentXMLReceiver(xmlReceiver));
} else {
// Generate a complete document
TransformerUtils.sourceToSAX(new DOMSource(document), xmlReceiver);
}
} else {
// Assume it is XML and parse the output
final XMLReader reader = XMLParsing.newXMLReader(XMLParsing.ParserConfiguration.PLAIN);
if (fragment) {
// Do not generate start and end document events
final XMLReceiver forwardingXMLReceiver = new EmbeddedDocumentXMLReceiver(xmlReceiver);
reader.setContentHandler(forwardingXMLReceiver);
reader.setProperty(XMLConstants.SAX_LEXICAL_HANDLER, forwardingXMLReceiver);
} else {
// Generate a complete document
reader.setContentHandler(xmlReceiver);
reader.setProperty(XMLConstants.SAX_LEXICAL_HANDLER, xmlReceiver);
}
reader.parse(inputSource);
}
}
} catch (SAXParseException e) {
throw new ValidationException(e.getMessage(), new LocationData(e));
} catch (Exception e) {
throw new OXFException(e);
}
}
}