/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cocoon.transformation;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.StringTokenizer;

import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.environment.SourceResolver;
import org.apache.cocoon.transformation.AbstractSAXTransformer;
import org.apache.cocoon.xml.XMLUtils;
import org.apache.cocoon.xml.IncludeXMLConsumer;
import org.apache.excalibur.source.Source;
import org.w3c.tidy.Tidy;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

/**
 * Converts (escaped) HTML snippets into JTidied HTML.
 *
 * <p>This transformer expects a list of element names, passed as the
 * comma separated value of the "tags" sitemap parameter. It records the
 * text enclosed in each such element and passes it through JTidy; the
 * resulting DOM is streamed into the pipeline in place of the recorded
 * text.
 *
 * <p>Elements are matched on the <code>name</code> argument of the SAX
 * callbacks only; the namespace URI is not consulted.
 *
 * <p>TODO: Add namespace support.
 * <p><strong>WARNING:</strong> This transformer should be considered unstable.
 *
 * @author <a href="mailto:d.madama@pro-netics.com">Daniele Madama</a>
 * @author <a href="mailto:gianugo@apache.org">Gianugo Rabellino</a>
 *
 * @version CVS $Id$
 */
public class HTMLTransformer
    extends AbstractSAXTransformer
    implements Configurable {

    /**
     * Properties for Tidy format. Stays <code>null</code> unless a
     * "jtidy-config" location was configured and loaded successfully.
     */
    private Properties properties;

    /**
     * Tags that must be normalized. Used as a set: every tag name is
     * stored as both key and value.
     */
    private Map tags;

    /**
     * React on endElement calls that contain a tag to be
     * tidied and run JTidy on it, otherwise pass through.
     *
     * @param uri  the namespace URI of the element (not used for matching)
     * @param name the name looked up in the configured tag list
     * @param raw  the raw (qualified) element name
     * @throws SAXException if the recorded text cannot be tidied, or if the
     *         parent handler fails
     *
     * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
     */
    public void endElement(String uri, String name, String raw)
        throws SAXException {
        if (this.tags.containsKey(name)) {
            String toBeNormalized = this.endTextRecording();
            try {
                this.normalize(toBeNormalized);
            } catch (ProcessingException e) {
                // Do not swallow the failure: the recorded text has already
                // been taken out of the stream, so carrying on would
                // silently drop content from the output.
                throw new SAXException(
                    "Unable to tidy the content of element '" + name + "'", e);
            }
        }
        super.endElement(uri, name, raw);
    }

    /**
     * Start buffering text if inside a tag to be normalized,
     * pass through otherwise.
     *
     * <p>The start tag itself is always forwarded before recording begins.
     *
     * @param uri  the namespace URI of the element (not used for matching)
     * @param name the name looked up in the configured tag list
     * @param raw  the raw (qualified) element name
     * @param attr the attributes of the element
     * @throws SAXException if the parent handler fails
     *
     * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
     */
    public void startElement(
        String uri,
        String name,
        String raw,
        Attributes attr)
        throws SAXException {
        super.startElement(uri, name, raw, attr);
        if (this.tags.containsKey(name)) {
            this.startTextRecording();
        }
    }

    /**
     * Configure this transformer, possibly passing to it
     * a jtidy configuration file location.
     *
     * <p>The location is read from the optional <code>jtidy-config</code>
     * child of the configuration. When it is absent nothing is loaded.
     *
     * @param config the component configuration
     * @throws ConfigurationException if the parent configuration fails or
     *         the JTidy properties cannot be resolved or loaded
     */
    public void configure(Configuration config) throws ConfigurationException {
        super.configure(config);

        String configUrl = config.getChild("jtidy-config").getValue(null);
        if (configUrl == null) {
            return;
        }

        org.apache.excalibur.source.SourceResolver resolver = null;
        Source configSource = null;
        InputStream configStream = null;
        try {
            resolver =
                (org.apache.excalibur.source.SourceResolver)
                    this.manager.lookup(org.apache.excalibur.source.SourceResolver.ROLE);
            configSource = resolver.resolveURI(configUrl);
            if (getLogger().isDebugEnabled()) {
                getLogger().debug(
                    "Loading configuration from " + configSource.getURI());
            }
            configStream = configSource.getInputStream();
            Properties loaded = new Properties();
            loaded.load(configStream);
            // Publish only after a successful load so that a failure never
            // leaves a half-initialised Properties object behind.
            this.properties = loaded;
        } catch (Exception e) {
            getLogger().warn("Cannot load configuration from " + configUrl);
            throw new ConfigurationException(
                "Cannot load configuration from " + configUrl,
                e);
        } finally {
            if (configStream != null) {
                try {
                    configStream.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if close() fails; the
                    // properties have either been read or the original
                    // failure is already being reported.
                    if (getLogger().isDebugEnabled()) {
                        getLogger().debug(
                            "Cannot close configuration stream for " + configUrl);
                    }
                }
            }
            if (resolver != null) {
                // The source must be handed back while the resolver is still
                // held; only then is the resolver returned to the manager.
                if (configSource != null) {
                    resolver.release(configSource);
                }
                this.manager.release(resolver);
            }
        }
    }

    /**
     * The beef: run JTidy on the buffered text and stream
     * the result to the next pipeline component.
     *
     * @param text the string to be tidied
     * @throws ProcessingException wrapping any failure raised while
     *         parsing the text or streaming the resulting DOM
     */
    private void normalize(String text) throws ProcessingException {
        try {
            // Setup an instance of Tidy.
            Tidy tidy = new Tidy();
            tidy.setXmlOut(true);

            if (this.properties == null) {
                tidy.setXHTML(true);
            } else {
                tidy.setConfigurationFromProps(this.properties);
            }

            // Set JTidy warnings on-off
            tidy.setShowWarnings(getLogger().isWarnEnabled());
            // Set JTidy final result summary on-off
            tidy.setQuiet(!getLogger().isInfoEnabled());
            // Send JTidy messages to a String (will be logged) instead of System.out
            StringWriter stringWriter = new StringWriter();
            PrintWriter errorWriter = new PrintWriter(stringWriter);
            tidy.setErrout(errorWriter);

            // Extract the document using JTidy and stream it.
            // NOTE(review): the text is handed over as UTF-8 bytes, but no
            // input encoding is set on the Tidy instance here - confirm that
            // the JTidy defaults (or the loaded properties) expect UTF-8.
            ByteArrayInputStream bais =
                new ByteArrayInputStream(text.getBytes("UTF-8"));
            org.w3c.dom.Document doc =
                tidy.parseDOM(new BufferedInputStream(bais), null);

            // FIXME: JTidy doesn't warn or strip duplicate attributes in same
            // tag; stripping.
            XMLUtils.stripDuplicateAttributes(doc, null);

            errorWriter.flush();
            errorWriter.close();
            if (getLogger().isWarnEnabled()) {
                String report = stringWriter.toString();
                // Avoid emitting empty WARN entries when JTidy had nothing to say.
                if (report.trim().length() > 0) {
                    getLogger().warn(report);
                }
            }

            IncludeXMLConsumer.includeNode(doc, this.contentHandler, this.lexicalHandler);
        } catch (Exception e) {
            throw new ProcessingException(
                "Exception in HTMLTransformer.normalize()",
                e);
        }
    }

    /**
     * Setup this component, passing the tag names to be tidied.
     *
     * <p>The "tags" parameter is split on commas; every entry is trimmed
     * and blank entries are ignored. A missing parameter yields an empty
     * tag list.
     *
     * @param resolver    the source resolver of the current request
     * @param objectModel the object model of the current request
     * @param src         the <code>src</code> attribute from the sitemap
     * @param par         the sitemap parameters; "tags" is read from here
     */
    public void setup(
        SourceResolver resolver,
        Map objectModel,
        String src,
        Parameters par)
        throws ProcessingException, SAXException, IOException {
        super.setup(resolver, objectModel, src, par);

        String tagsParam = par.getParameter("tags", "");
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("tags: " + tagsParam);
        }

        this.tags = new HashMap();
        StringTokenizer tokenizer = new StringTokenizer(tagsParam, ",");
        while (tokenizer.hasMoreTokens()) {
            String tok = tokenizer.nextToken().trim();
            // A blank entry such as "a, ,b" must not register the empty
            // string as a tag name.
            if (tok.length() > 0) {
                this.tags.put(tok, tok);
            }
        }
    }
}