/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cocoon.transformation;

import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.activity.Initializable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.parameters.ParameterException;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.components.modules.input.InputModuleHelper;
import org.apache.cocoon.environment.SourceResolver;
import org.apache.cocoon.transformation.helpers.VariableConfiguration;
import org.apache.regexp.RE;
import org.apache.regexp.RECompiler;
import org.apache.regexp.REProgram;
import org.apache.regexp.RESyntaxException;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * Rewrites URIs in links to a value determined by an InputModule.
 * The URI scheme identifies the InputModule to use, and the rest of the URI is
 * used as the attribute name.
 *
 * <h3>Example</h3>
 * <p>For instance, if we had an {@link
 * org.apache.cocoon.components.modules.input.XMLFileModule}, configured to
 * read values from an XML file:
 * <pre>
 * &lt;site&gt;
 *   &lt;faq&gt;
 *     &lt;how_to_boil_eggs href="faq/eggs.html"/&gt;
 *   &lt;/faq&gt;
 * &lt;/site&gt;
 * </pre>
 *
 * mapped to the prefix 'site:', then <code>&lt;link
 * href="site:/site/faq/how_to_boil_eggs/@href"&gt;</code> would be replaced
 * with <code>&lt;link href="faq/eggs.html"&gt;</code>
 *
 * <h3>InputModule Configuration</h3>
 * <p>InputModules are configured twice; first statically in
 * <code>cocoon.xconf</code>, and then dynamically at runtime, with dynamic
 * configuration (if any) taking precedence. Transformer allows
 * you to pass a dynamic configuration to used InputModules as follows.
 *
 * <p>First, a template Configuration is specified in the static
 * &lt;map:components&gt; block of the sitemap within &lt;input-module&gt; tags:
 * <pre>
 *  &lt;map:transformer name="linkrewriter"
 *                   src="org.apache.cocoon.transformation.LinkRewriterTransformer"&gt;
 *    &lt;link-attrs&gt;href src&lt;/link-attrs&gt;
 *    &lt;schemes&gt;site ext&lt;/schemes&gt;
 *    &lt;input-module name="site"&gt;
 *      &lt;file src="cocoon://samples/link/linkmap" reloadable="true"/&gt;
 *    &lt;/input-module&gt;
 *    &lt;input-module name="mapper"&gt;
 *      &lt;input-module name="site"&gt;
 *        &lt;file src="{src}" reloadable="true"/&gt;
 *      &lt;/input-module&gt;
 *      &lt;prefix&gt;/site/&lt;/prefix&gt;
 *      &lt;suffix&gt;/@href&lt;/suffix&gt;
 *    &lt;/input-module&gt;
 *  &lt;/map:transformer&gt;
 * </pre>
 *
 * Here, we have first configured which attributes to examine, and which URL
 * schemes to consider rewriting. In this example, &lt;a href="site:index"&gt;
 * would be processed. See below for more configuration options.
 *
 * <p>Then, we have established dynamic configuration templates for two modules,
 * 'site' (an {@link org.apache.cocoon.components.modules.input.XMLFileModule})
 * and 'mapper' (A {@link
 * org.apache.cocoon.components.modules.input.SimpleMappingMetaModule}). All
 * other InputModules will use their static configs. Note that, when
 * configuring a meta InputModule like 'mapper', we need to also configure the
 * 'inner' module (here, 'site') with a nested &lt;input-module&gt;.
 *
 * <p>There is one further twist; to have <em>really</em> dynamic configuration,
 * we need information available only when the transformer actually runs. This
 * is why the above config was called a "template" configuration; it needs to
 * be 'instantiated' and provided extra info, namely:
 * <ul>
 *  <li>The {src} string will be replaced with the map:transform @src attribute value.
 *  <li>Any other {variables} will be replaced with map:parameter values
 * </ul>
 *
 * With the above config template, we can have a matcher like:
 *
 * <pre>
 *    &lt;map:match pattern="**welcome"&gt;
 *      &lt;map:generate src="index.xml"/&gt;
 *      &lt;map:transform type="linkrewriter" src="cocoon:/{1}linkmap"/&gt;
 *      &lt;map:serialize type="xml"/&gt;
 *    &lt;/map:match&gt;
 * </pre>
 *
 * Which would cause the 'mapper' XMLFileModule to be configured with a
 * different XML file, depending on the request.
 *
 * <p>Similarly, we could use a dynamic prefix:
 * <pre>
 *   &lt;prefix&gt;{prefix}&lt;/prefix&gt;
 * </pre>
 * in the template config, and:
 * <pre>
 *   &lt;map:parameter name="prefix" value="/site/"/&gt;
 * </pre>
 * in the map:transform
 *
 * <p>A live example of LinkRewriterTransformer can be found in the <a
 * href="http://forrest.apache.org/">Apache Forrest</a> sitemap.
 *
 * <h3>Transformer Configuration</h3>
 * <p>
 * The following configuration entries in map:transformer block are recognised:
 * <dl>
 *  <dt>link-attrs</dt>
 *  <dd>Space-separated list of attributes to consider links (to be
 *  transformed). The whole value of the attribute is considered link and
 *  transformed.</dd>
 *
 *  <dt>link-attr</dt>
 *  <dd>0..n of these elements each specify an attribute containing link(s)
 *  (to be transformed) and optionally a regular expression to locate
 *  substring(s) of the attribute value considered link(s). Has two
 *  attributes:
 *   <dl>
 *    <dt>name</dt>
 *    <dd>(required) name of the attribute whose value contains link(s).</dd>
 *    <dt>pattern</dt>
 *    <dd>(optional) regular expression such that when matched against the
 *    attribute value, all parenthesized expressions (except number 0) will
 *    be considered links that should be transformed. If absent, the whole value
 *    of the attribute is considered to be a link, as if the attribute was
 *    included in 'link-attrs'.</dd>
 *   </dl>
 *  </dd>
 *
 *  <dt>schemes</dt>
 *  <dd>Space-separated list of URI schemes to explicitly include.
 *  If specified, all URIs with unlisted schemes will <i>not</i> be converted.</dd>
 *
 *  <dt>exclude-schemes</dt>
 *  <dd>Space-separated list of URI schemes to explicitly exclude.
 *  Defaults to 'http https ftp news mailto'.</dd>
 *
 *  <dt>bad-link-str</dt>
 *  <dd>String to use for links with a correct InputModule prefix, but no value
 *  therein. Defaults to the original URI.</dd>
 *
 *  <dt>namespace-uri</dt>
 *  <dd>The namespace uri of elements whose attributes are considered for
 *  transformation. Defaults to the empty namespace ("").</dd>
 * </dl>
 *
 * <p>
 * The attributes considered to contain links are a <em>set</em> of the attributes
 * specified in 'link-attrs' element and all 'link-attr' elements. Each attribute
 * should be specified only once either in 'link-attrs' or 'link-attr'; i.e. an
 * attribute can have at most 1 regular expression associated with it. If neither
 * 'link-attrs' nor 'link-attr' configuration is present, defaults to 'href'.
 *
 * <p>Below is an example of regular expression usage that will transform links
 * <code>x1</code> and <code>x2</code> in
 * <code>&lt;action target="foo url(x1) bar url(x2)"/&gt;</code>:
 *
 * <pre>
 *  &lt;map:transformer name="linkrewriter"
 *                   src="org.apache.cocoon.transformation.LinkRewriterTransformer"&gt;
 *    &lt;link-attr name="target" pattern="(?:url\((.*?)\).*?){1,2}$"/&gt;
 *    &lt;!-- additional configuration ... --&gt;
 *  &lt;/map:transformer&gt;
 * </pre>
 *
 * <p>
 * When matched against the value of <code>target</code> attribute above,
 * the parenthesized expressions are:<br/>
 * <samp>
 * $0 = url(x1) bar url(x2)<br/>
 * $1 = x1<br/>
 * $2 = x2<br/>
 * </samp>
 *
 * <p>
 * Expression number 0 is always discarded by the transformer and the rest
 * are considered links and re-written.
 *
 * <p>If present, map:parameter's from the map:transform block override the
 * corresponding configuration entries from map:transformer. As an exception,
 * 'link-attr' parameters are not recognised; 'link-attrs' parameter overrides
 * both 'link-attrs' and 'link-attr' configuration.
 *
 * <p>
 *
 * @version $Id$
 */
public class LinkRewriterTransformer extends AbstractSAXTransformer
        implements Initializable, Disposable {

    /** Default for the 'namespace-uri' configuration entry: the empty namespace. */
    private static final String NAMESPACE = "";

    /**
     * A guardian object denoting absence of regexp pattern for a given
     * attribute. Used as value in linkAttrs and origLinkAttrs maps.
     * Compared by identity, never by equals().
     */
    private static final Object NO_REGEXP = new Object();

    //
    // Configure()'d parameters
    //

    /** Configuration passed to the component once through configure(). */
    private Configuration origConf;

    /** 'bad-link-str' as read in configure(); may be null when not configured. */
    private String origBadLinkStr;

    /** 'schemes' as read in configure(); space-separated, "" when not configured. */
    private String origInSchemes;

    /** 'exclude-schemes' as read in configure(); space-separated. */
    private String origOutSchemes;

    /** 'namespace-uri' as read in configure(). */
    private String origNamespaceURI;

    /**
     * A map where keys are those attributes which are considered 'links'.
     * Obtained from configuration passed to the component once through
     * the configure() method.
     *
     * <p>Map contains NO_REGEXP object for attributes whose whole values are
     * considered links, or compiled RE expressions for attributes whose values
     * might contain a link.
     */
    private Map origLinkAttrs;

    //
    // Setup()'d parameters
    //

    /**
     * Derivation of origConf with variables obtained from setup() parameters.
     * Recreated once per invocation.
     */
    private Configuration conf;

    /**
     * String to use for links with a correct InputModule prefix, but no value
     * therein.
     */
    private String badLinkStr;

    /** Set containing schemes (protocols) of links to process */
    private Set inSchemes;

    /** Set containing schemes (protocols) of links to exclude from processing */
    private Set outSchemes;

    /**
     * A map of attributes considered 'links' and corresponding RE expression
     * or NO_REGEXP object. Recreated once per invocation or copied from
     * origLinkAttrs based on setup() method parameters.
     */
    private Map linkAttrs;

    /** Helper used to look up values in InputModules; created in initialize(). */
    private InputModuleHelper modHelper;

    /**
     * Configure this component from the map:transformer block. Called before
     * initialization and setup.
     *
     * @param conf the map:transformer configuration
     * @throws ConfigurationException if a 'link-attr' element lacks its 'name'
     *         attribute or carries a 'pattern' that is not a valid regexp
     */
    public void configure(Configuration conf) throws ConfigurationException {
        super.configure(conf);

        this.origConf = conf;
        this.origBadLinkStr = conf.getChild("bad-link-str").getValue(null);
        this.origInSchemes = conf.getChild("schemes").getValue("");
        this.origOutSchemes = conf.getChild("exclude-schemes").getValue("http https ftp news mailto");
        this.origNamespaceURI = conf.getChild("namespace-uri").getValue(NAMESPACE);

        /*
         * Setup origLinkAttrs map from the original Configuration:
         * 1. Parse link-attrs Configuration
         * 2. Process link-attr Children, warn if overwriting
         * 3. If no link-attrs, and no link-attr are available, defaults to "href"
         */
        String linkAttrsValue = conf.getChild("link-attrs").getValue("");
        this.origLinkAttrs = split(linkAttrsValue, " ", NO_REGEXP);

        Configuration[] attrConfs = conf.getChildren("link-attr");
        if (attrConfs.length > 0) {
            // One compiler instance is reused for every pattern of this block.
            RECompiler compiler = new RECompiler();
            for (int i = 0; i < attrConfs.length; i++) {
                String attr = attrConfs[i].getAttribute("name");
                if (getLogger().isWarnEnabled() && origLinkAttrs.containsKey(attr)) {
                    getLogger().warn("Duplicate configuration entry found for attribute '" +
                                     attr + "', overwriting previous configuration");
                }

                String pattern = attrConfs[i].getAttribute("pattern", null);
                if (pattern == null) {
                    // No pattern: the whole attribute value is the link.
                    this.origLinkAttrs.put(attr, NO_REGEXP);
                } else {
                    try {
                        this.origLinkAttrs.put(attr, compiler.compile(pattern));
                    } catch (RESyntaxException e) {
                        String msg = "Invalid regexp pattern '" + pattern +
                                     "' specified for attribute '" + attr + "'";
                        throw new ConfigurationException(msg, attrConfs[i], e);
                    }
                }
            }
        }

        // If nothing configured, default to href attribute
        if (this.origLinkAttrs.size() == 0) {
            this.origLinkAttrs.put("href", NO_REGEXP);
        }
    }

    /**
     * Initiate resources prior to this component becoming active.
     */
    public void initialize() throws Exception {
        this.modHelper = new InputModuleHelper();
        this.modHelper.setup(this.manager);
    }

    /**
     * Setup this component to handle a map:transform instance.
     * Per-request map:parameter values take precedence over the values read
     * in configure().
     *
     * @param resolver    passed through to the superclass
     * @param objectModel passed through to the superclass
     * @param src         map:transform @src value; substituted for {src} in
     *                    the configuration template
     * @param parameters  map:parameter values of the map:transform block
     * @throws ProcessingException if the dynamic configuration cannot be built
     */
    public void setup(SourceResolver resolver,
                      Map objectModel,
                      String src,
                      Parameters parameters)
    throws ProcessingException, SAXException, IOException {
        super.setup(resolver, objectModel, src, parameters);

        this.badLinkStr = parameters.getParameter("bad-link-str", // per-request config
                                                  this.origBadLinkStr); // else fall back to per-instance config

        this.namespaceURI = parameters.getParameter("namespace-uri", this.origNamespaceURI);

        this.inSchemes = split(parameters.getParameter("schemes", this.origInSchemes), " ");
        this.outSchemes = split(parameters.getParameter("exclude-schemes", this.origOutSchemes), " ");

        // The configured map is shared by reference; it is only read from here on.
        this.linkAttrs = this.origLinkAttrs;
        if (parameters.isParameter("link-attrs")) {
            try {
                this.linkAttrs = split(parameters.getParameter("link-attrs"), " ", NO_REGEXP);
            } catch (ParameterException ex) {
                // shouldn't happen
            }
        }

        if (getLogger().isDebugEnabled()) {
            getLogger().debug("bad-link-str = " + badLinkStr);
            getLogger().debug("link-attrs = " + linkAttrs);
            getLogger().debug("schemes = " + inSchemes);
            getLogger().debug("exclude-schemes = " + outSchemes);
            getLogger().debug("namespace-uri = " + namespaceURI);
        }

        // Generate conf
        VariableConfiguration varConf = new VariableConfiguration(this.origConf);
        varConf.addVariable("src", src);
        varConf.addVariables(parameters);
        try {
            this.conf = varConf.getConfiguration();
        } catch (ConfigurationException ce) {
            throw new ProcessingException("Couldn't create dynamic config ", ce);
        }
    }

    /** Recycle this component for use in another map:transform. */
    public void recycle() {
        // Note: configure() and initialize() are not called after every
        // recycle, so don't null origConf, origLinkAttrs, etc.
        this.conf = null;
        this.badLinkStr = null;
        this.linkAttrs = null;
        this.inSchemes = null;
        this.outSchemes = null;
        super.recycle();
    }

    /**
     * Split a string into a Set of strings.
     *
     * @param str String to split
     * @param delim Delimiter character(s), as understood by StringTokenizer
     * @return A Set of strings in 'str', or null if 'str' is null
     */
    private Set split(String str, String delim) {
        if (str == null) {
            return null;
        }

        Set tokens = new HashSet();
        StringTokenizer st = new StringTokenizer(str, delim);
        while (st.hasMoreTokens()) {
            tokens.add(st.nextToken());
        }
        return tokens;
    }

    /**
     * Split a string and create a Map where keys are the tokens from the string.
     * A warning is logged for every token that occurs more than once.
     *
     * @param str String to split
     * @param delim Delimiter character(s), as understood by StringTokenizer
     * @param valueObj Object to insert in the Map (may be null)
     * @return A Map of strings in 'str', or null if 'str' is null
     */
    private Map split(String str, String delim, Object valueObj) {
        if (str == null) {
            return null;
        }

        // valueObj may be null, because HashMap permits null values
        Map attrs = new HashMap();
        StringTokenizer st = new StringTokenizer(str, delim);
        while (st.hasMoreTokens()) {
            String token = st.nextToken();
            if (attrs.containsKey(token) && getLogger().isWarnEnabled()) {
                getLogger().warn("Duplicate configuration entry found for attribute '" +
                                 token + "', overwriting previous configuration");
            }
            attrs.put(token, valueObj);
        }
        return attrs;
    }

    /**
     * Start processing elements of our namespace.
     * This hook is invoked for each sax event with our namespace.
     * Attributes are looked up by their qualified name; the attribute list is
     * copied only when at least one value actually changes.
     *
     * @param uri The namespace of the element.
     * @param name The local name of the element.
     * @param raw The qualified name of the element.
     * @param attr The attributes of the element.
     */
    public void startTransformingElement(String uri,
                                         String name,
                                         String raw,
                                         Attributes attr)
    throws ProcessingException, IOException, SAXException {
        boolean matched = false;

        for (int attrIdx = 0; attrIdx < attr.getLength(); attrIdx++) {
            String attrName = attr.getQName(attrIdx);
            String attrValue = createTransformedAttr(attrName, attr.getValue(attrIdx));
            if (attrValue != null) {
                if (!matched) {
                    // Copy lazily: the incoming Attributes is not ours to modify.
                    attr = new AttributesImpl(attr);
                    matched = true;
                }
                ((AttributesImpl) attr).setValue(attrIdx, attrValue);
            }
        }

        super.startTransformingElement(uri, name, raw, attr);
    }

    /**
     * Rewrite set of links in an attribute.
     *
     * @param attrName QName of the attribute containing unconverted link(s).
     * @param oldAttrValue value of the attribute containing unconverted link(s).
     * @return new value of the attribute based on <code>oldAttrValue</code>, but with link(s) rewritten. If not
     *         modified, returns null (for example, if attribute not found in <code>linkAttrs</code> or not matched to
     *         regexp pattern).
     */
    private String createTransformedAttr(String attrName, String oldAttrValue) {
        if (!linkAttrs.containsKey(attrName)) {
            return null;
        }

        String newAttrValue = null;
        Object reProgram = linkAttrs.get(attrName);
        if (reProgram == NO_REGEXP) {
            newAttrValue = createTransformedLink(oldAttrValue);
        } else {
            // must be instanceof REProgram
            RE r = new RE((REProgram) reProgram);
            if (r.match(oldAttrValue)) {
                StringBuffer bufOut = new StringBuffer(oldAttrValue);
                // Running difference in length between bufOut and oldAttrValue,
                // needed because paren positions refer to the original value.
                int offset = 0;
                String link = null;
                String newLink = null;
                boolean modified = false;

                // skip the first paren
                for (int i = 1; i < r.getParenCount(); i++) {
                    // NOTE(review): getParen() is expected to yield null for a
                    // group that did not take part in the match; such a group is
                    // left alone by createTransformedLink() returning null.
                    link = r.getParen(i);
                    newLink = createTransformedLink(link);
                    if (newLink != null) {
                        bufOut.replace(r.getParenStart(i) + offset,
                                       r.getParenEnd(i) + offset,
                                       newLink);
                        offset += newLink.length() - r.getParenLength(i);
                        modified = true;
                    }
                }
                if (modified) {
                    newAttrValue = bufOut.toString();
                }
            }
        }

        return newAttrValue;
    }

    /**
     * Rewrite a link - use InputModule to obtain new value for the link based on
     * <code>oldLink</code>.
     *
     * @param oldLink value of the unconverted link; may be null, in which case
     *        there is nothing to rewrite.
     * @return new value of the link. If not modified, returns null (for example, if link scheme
     *         is in <code>outSchemes</code>, or <code>oldLink</code> is null or has no scheme).
     */
    private String createTransformedLink(String oldLink) {
        // A null link (e.g. from a regexp group without a match) cannot be
        // rewritten; report "not modified" instead of failing on indexOf().
        if (oldLink == null) {
            return null;
        }

        String newLink = null;
        int i = oldLink.indexOf(":");
        if (i != -1) {
            String scheme = oldLink.substring(0, i);
            String addr = oldLink.substring(i + 1);
            if (outSchemes.contains(scheme)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Ignoring link '" + oldLink + "'");
                }
            } else if (inSchemes.contains(scheme) || inSchemes.size() == 0) {
                // If the link wasn't deliberately excluded from a
                // list of 'good' links, then include it.
                try {
                    newLink = (String) modHelper.getAttribute(this.objectModel,
                                                              getConf(scheme),
                                                              scheme,
                                                              addr,
                                                              (badLinkStr != null? badLinkStr: scheme + ":" + addr));
                    if (getLogger().isDebugEnabled()) {
                        getLogger().debug("Converted link '" + oldLink + "' to '" + newLink + "'");
                    }
                } catch (org.apache.avalon.framework.CascadingRuntimeException e) {
                    // Rethrow Configuration errors
                    if (e.getCause() instanceof ConfigurationException) {
                        throw e;
                    }

                    // Swallow IM errors, usually prefixes like 'telnet' that aren't
                    // bound to an InputModule. These should really be declared in
                    // 'exclude-schemes', hence the 'error' classification of this log.
                    if (getLogger().isErrorEnabled()) {
                        getLogger().error("Error rewriting link '" + oldLink + "': " +
                                          e.getMessage());
                    }
                }
            }
        }

        return newLink;
    }

    /**
     * Retrieve a dynamic configuration for a specific InputModule.
     *
     * @param scheme InputModule name
     * @return Configuration for specified scheme, from the map:transformer block,
     *         or null if no 'input-module' child carries that name.
     */
    private Configuration getConf(String scheme) {
        Configuration[] schemeConfs = this.conf.getChildren("input-module");
        for (int i = 0; i < schemeConfs.length; i++) {
            if (scheme.equals(schemeConfs[i].getAttribute("name", null))) {
                return schemeConfs[i];
            }
        }
        return null;
    }

    /**
     * Release the InputModule helper (if one was created) and dispose the
     * superclass.
     *
     * @see org.apache.avalon.framework.activity.Disposable#dispose()
     */
    public void dispose() {
        if (this.modHelper != null) {
            this.modHelper.releaseAll();
            this.modHelper = null;
        }
        super.dispose();
    }
}