package org.tizzit.cocoon.generic.transformation; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; import java.util.HashMap; import java.util.Map; import java.util.Properties; import java.util.StringTokenizer; import javax.xml.transform.OutputKeys; import org.apache.avalon.framework.configuration.Configurable; import org.apache.avalon.framework.configuration.Configuration; import org.apache.avalon.framework.configuration.ConfigurationException; import org.apache.avalon.framework.parameters.Parameters; import org.apache.avalon.framework.service.ServiceException; import org.apache.avalon.framework.service.ServiceManager; import org.apache.cocoon.ProcessingException; import org.apache.cocoon.configuration.Settings; import org.apache.cocoon.environment.SourceResolver; import org.apache.cocoon.transformation.AbstractSAXTransformer; import org.apache.cocoon.xml.IncludeXMLConsumer; import org.apache.cocoon.xml.XMLUtils; import org.apache.excalibur.source.Source; import org.apache.log4j.Logger; import org.tizzit.util.tidy.Tidy; import org.xml.sax.Attributes; import org.xml.sax.SAXException; /** * <b>Patched version of <code>org.apache.cocoon.transformation.HTMLTransformer</code>.</b><br/> * Converts (escaped) HTML snippets into JTidied HTML. * This transformer expects a list of elements, passed as comma separated * values of the "tags" parameter. It records the text enclosed in such * elements and pass it thru JTidy to obtain valid XHTML. * * <p>TODO: Add namespace support. * <p><strong>WARNING:</strong> This transformer should be considered unstable. * * @cocoon.sitemap.component.documentation * Converts (escaped) HTML snippets into JTidied HTML. * This transformer expects a list of elements, passed as comma separated * values of the "tags" parameter. It records the text enclosed in such * elements and pass it thru JTidy to obtain valid XHTML. * @cocoon.sitemap.component.documentation.caching Not Implemented * * @version $Id$ */ public class TidyHTMLTransformer extends AbstractSAXTransformer implements Configurable { private static final Logger log = Logger.getLogger(TidyHTMLTransformer.class); private static final String PARAM_NAME_ENCODING = "encoding"; /** * The properties. */ protected Properties properties = new Properties(); /** * Tags that must be normalized */ private Map<String, String> tags; /** * The encoding. * <p> * A parameter could be used, to set the encoding <code><map:parameter name="encoding" value="UTF-8"/></code>.<br/> * Default value: <code>org.apache.cocoon.containerencoding</code>.<br/> * Fallback value if an error occurred: <code>UTF-8</code> * </p> * <p> * <b>Please make sure, that you use the same encoding in your tidy.properties (<code>char-encoding=utf8</code>).</b> * </p> */ protected String defaultEncoding = "UTF-8"; /** * React on endElement calls that contain a tag to be * tidied and run Jtidy on it, otherwise passthru. * * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String) */ public void endElement(String uri, String name, String raw) throws SAXException { if (this.tags.containsKey(name)) { String toBeNormalized = this.endTextRecording(); try { this.normalize(toBeNormalized); } catch (ProcessingException exe) { exe.printStackTrace(); } } super.endElement(uri, name, raw); } /** * Start buffering text if inside a tag to be normalized, * passthru otherwise. * * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes) */ public void startElement(String uri, String name, String raw, Attributes attr) throws SAXException { super.startElement(uri, name, raw, attr); if (this.tags.containsKey(name)) { this.startTextRecording(); } } /** * Configure this transformer, possibly passing to it * a jtidy configuration file location. * * @deprecated use property injection instead */ public void configure(Configuration config) throws ConfigurationException { super.configure(config); String configUrl = config.getChild("jtidy-config").getValue(null); try { if (this.resolver == null) { this.resolver = (SourceResolver) this.manager.lookup(SourceResolver.ROLE); } loadJTidyProperties(configUrl, this.resolver); } catch (Exception exe) { log.error(exe.getMessage(), exe); } } /** * The beef: run JTidy on the buffered text and stream * the result * * @param text the string to be tidied */ private void normalize(String text) throws ProcessingException { try { // Setup an instance of Tidy. Tidy tidy = new Tidy(); tidy.setXmlOut(true); if (this.properties == null) { tidy.setXHTML(true); } else { tidy.setConfigurationFromProps(this.properties); } //Set Jtidy final result summary on-off tidy.setQuiet(!log.isInfoEnabled()); //Set Jtidy infos to a String (will be logged) instead of System.out StringWriter stringWriter = new StringWriter(); PrintWriter errorWriter = new PrintWriter(stringWriter); tidy.setErrout(errorWriter); // Extract the document using JTidy and stream it. ByteArrayInputStream bais = new ByteArrayInputStream(text.getBytes(this.defaultEncoding)); org.w3c.dom.Document doc = tidy.parseDOM(new BufferedInputStream(bais), null); // FIXME: Jtidy doesn't warn or strip duplicate attributes in same // tag; stripping. XMLUtils.stripDuplicateAttributes(doc, null); errorWriter.flush(); errorWriter.close(); if (tidy.getShowWarnings()) { log.warn(stringWriter.toString()); } IncludeXMLConsumer.includeNode(doc, this.contentHandler, this.lexicalHandler); } catch (Exception exe) { throw new ProcessingException("Exception in TidyHTMLTransformer.normalize()", exe); } } /** * Setup this component, passing the tag names to be tidied. */ @SuppressWarnings("unchecked") public void setup(SourceResolver resolver, Map objectModel, String src, Parameters par) throws ProcessingException, SAXException, IOException { super.setup(resolver, objectModel, src, par); String tagsParam = par.getParameter("tags", ""); if (log.isDebugEnabled()) { log.debug("tags: " + tagsParam); } this.tags = new HashMap<String, String>(); StringTokenizer tokenizer = new StringTokenizer(tagsParam, ","); while (tokenizer.hasMoreElements()) { String tok = tokenizer.nextToken().trim(); this.tags.put(tok, tok); } // sets the encoding if possible try { this.defaultEncoding = par.getParameter(PARAM_NAME_ENCODING); } catch (Exception exe) { } if (!this.properties.containsKey(OutputKeys.ENCODING) && this.defaultEncoding != null) { this.properties.put(OutputKeys.ENCODING, this.defaultEncoding); } String configUrl = this.properties.getProperty("jtidy-config"); loadJTidyProperties(configUrl, resolver); } /** * Set the default encoding. This will be overided if an encoding parameter is set. * This is mainly useful together with Spring * bean inheritance. * * @param defaultEncoding the defaultEncoding to set * @see de.juwimm.cocoon.components.transformation.TidyHTMLTransformer#encoding */ public void setDefaultEncoding(String defaultEncoding) { this.defaultEncoding = defaultEncoding; } /** * @see org.apache.avalon.framework.service.Serviceable#service(ServiceManager) * @deprecated use property injection instead */ public void service(ServiceManager manager) throws ServiceException { super.service(manager); final Settings settings = (Settings) manager.lookup(Settings.ROLE); String defaultEncoding = settings.getFormEncoding(); if (defaultEncoding != null) { this.defaultEncoding = defaultEncoding; } this.properties.setProperty(OutputKeys.ENCODING, this.defaultEncoding); manager.release(settings); } /** * This is mainly useful together with Spring bean inheritance. * * @param properties the properties to set */ public void setProperties(Properties properties) { this.properties = properties; } /** * Loads and sets the JTidy properties. * * @param configUrl the URL to the properties (e.g. "resource://jtidy.properties") */ private void loadJTidyProperties(String configUrl, SourceResolver resolver) { try { if (configUrl != null) { Source configSource = null; try { configSource = resolver.resolveURI(configUrl); if (log.isDebugEnabled()) { log.debug("Loading configuration from " + configSource.getURI()); } this.properties.load(configSource.getInputStream()); } catch (Exception exe) { log.warn("Cannot load configuration from " + configUrl); throw new ConfigurationException("Cannot load configuration from " + configUrl, exe); } finally { if (configSource != null) { resolver.release(configSource); configSource = null; } } } } catch (Exception exe) { log.warn(exe.getMessage(), exe); } } }