/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cocoon.util; import org.apache.regexp.RE; import org.apache.regexp.RECompiler; import org.apache.regexp.REProgram; import java.util.HashMap; import java.util.Map; /** * This class is an utility class that perform wildcard-patterns matching and isolation. * * @version $Id$ */ public class WildcardMatcherHelper { //~ Static fields/initializers ----------------------------------------------------------------- /** Default path separator: "/" */ public static final char ESC = '\\'; /** Default path separator: "/" */ public static final char PATHSEP = '/'; /** Default path separator: "/" */ public static final char STAR = '*'; //~ Methods ------------------------------------------------------------------------------------ /** * Match a pattern agains a string and isolates wildcard replacement into a <code>Map</code>. * <br> * Here is how the matching algorithm works: * * <ul> * <li> * The '*' character, meaning that zero or more characters (excluding the path separator '/') * are to be matched. * </li> * <li> * The '**' sequence, meaning that zero or more characters (including the path separator '/') * are to be matched. * </li> * <li> * The '\*' sequence is honored as a litteral '*' character, not a wildcard * </li> * </ul> * <br> * When more than two '*' characters, not separated by another character, are found their value is * considered as '**' and immediate succeeding '*' are skipped. * <br> * The '**' wildcard is greedy and thus the following sample matches as {"foo/bar","baz","bug"}: * <dl> * <dt>pattern</dt> * <dd>STAR,STAR,PATHSEP,STAR,PATHSEP,STAR,STAR (why can't I express it litterally?)</dt> * <dt>string</dt> * <dd>foo/bar/baz/bug</dt> * </dl> * The first '**' in the pattern will suck up as much as possible without making the match fail. * * @param pat The pattern string. * @param str The string to math agains the pattern * * @return a <code>Map</code> containing the representation of the extracted pattern. The extracted patterns are * keys in the <code>Map</code> from left to right beginning with "1" for te left most, "2" for the next, * a.s.o. The key "0" is the string itself. If the return value is null, string does not match to the * pattern . */ public static Map match(final String pat, final String str) { Matcher matcher; synchronized (cache) { matcher = (Matcher) cache.get(pat); if ( matcher == null ) { matcher = new Matcher(pat); cache.put(pat, matcher); } } String[] list = matcher.getMatches(str); if ( list == null ) return null; int n = list.length; Map map = new HashMap(n * 2 + 1); for ( int i = 0; i < n; i++ ) { map.put(String.valueOf(i), list[i]); } return map; } /** Cache for compiled pattern matchers */ private static final Map cache = new HashMap(); //~ Inner Classes ------------------------------------------------------------------------------ /** * The private matcher class */ private static class Matcher { /** Regexp to split constant parts from front and back leaving wildcards in the middle. */ private static final REProgram splitter; static { final String fixedRE = "([^*\\\\]*)"; final String wcardRE = "(.*[*\\\\])"; final String splitRE = "^" + fixedRE + wcardRE + fixedRE + "$"; splitter = new RECompiler().compile(splitRE); } /** Wildcard types to short-cut simple '*' and "**' matches. */ private static final int WC_CONST = 0; private static final int WC_STAR = 1; private static final int WC_STARSTAR = 2; private static final int WC_REGEXP = 3; //~ Instance fields ------------------------------------------------------------------------ // All fields declared final to emphasize requirement to be thread-safe. /** Fixed text at start of pattern. */ private final String prefix; /** Fixed text at end of pattern. */ private final String suffix; /** Length of prefix and suffix. */ private final int fixlen; /** Wildcard type of pattern. */ private final int wctype; /** Compiled regexp equivalent to wildcard pattern between prefix and suffix. */ private final REProgram regexp; //~ Constructors --------------------------------------------------------------------------- /** * Creates a new Matcher object. * * @param pat The pattern * @param str The string */ Matcher(final String pat) { RE re = new RE(splitter); if ( re.match(pat) ) { // Split pattern into (foo/)(*)(/bar). prefix = re.getParen(1); String wildcard = re.getParen(2); String tail = re.getParen(3); // If wildcard ends with \ then add the first char of postfix to wildcard. if ( tail.length() != 0 && wildcard.charAt(wildcard.length() - 1) == ESC ) { wildcard = wildcard + tail.substring(0, 1); suffix = tail.substring(1); } else { suffix = tail; } // Use short-cuts for single * or ** wildcards if ( wildcard.equals("*") ) { wctype = WC_STAR; regexp = null; } else if ( wildcard.equals("**") ) { wctype = WC_STARSTAR; regexp = null; } else { wctype = WC_REGEXP; regexp = compileRegexp(wildcard); } } else { // Pattern is a constant without '*' or '\'. prefix = pat; suffix = ""; wctype = WC_CONST; regexp = null; } fixlen = prefix.length() + suffix.length(); } //~ Methods -------------------------------------------------------------------------------- /** * Match string against pattern. * * @param str The string * @return list of wildcard matches, null if match failed */ String[] getMatches(final String str) { // Protect against 'foo' matching 'foo*foo'. if ( str.length() < fixlen ) return null; if ( !str.startsWith(prefix) ) return null; if ( !str.endsWith(suffix) ) return null; String infix = str.substring(prefix.length(), str.length() - suffix.length()); if ( wctype == WC_REGEXP ) { RE re = new RE(regexp); if ( !re.match(infix) ) return null; int n = re.getParenCount(); String[] list = new String[n]; list[0] = str; for ( int i = 1; i < n; i++ ) list[i] = re.getParen(i); return list; } if ( wctype == WC_CONST ) { if ( infix.length() != 0 ) return null; return new String[] { str }; } if ( wctype == WC_STAR ) { if ( infix.indexOf(PATHSEP) != -1 ) return null; } return new String[] { str, infix }; } } /** * Compile wildcard pattern into regexp pattern. * * @param pat The wildcard pattern * @return compiled regexp program. */ private static REProgram compileRegexp(String pat) { StringBuffer repat = new StringBuffer(pat.length() * 6); repat.append('^'); // Add an extra character to allow unchecked wcpat[i+1] accesses. // Unterminated ESC sequences are silently handled as '\\'. char[] wcpat = (pat + ESC).toCharArray(); for ( int i = 0, n = pat.length(); i < n; i++ ) { char ch = wcpat[i]; if ( ch == STAR ) { if ( wcpat[i + 1] != STAR ) { repat.append("([^/]*)"); continue; } // Handle two and more '*' as single '**'. while ( wcpat[i + 1] == STAR ) i++; repat.append("(.*)"); continue; } // Match ESC+ESC and ESC+STAR as literal ESC and STAR which needs to be escaped // in regexp. Match ESC+other as two characters ESC+other where other may also // need to be escaped in regexp. if ( ch == ESC ) { ch = wcpat[++i]; if ( ch != ESC && ch != STAR ) repat.append("\\\\"); } if ( ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z' || ch >= '0' && ch <= '9' || ch == '/' ) { repat.append(ch); continue; } repat.append('\\'); repat.append(ch); } repat.append('$'); return new RECompiler().compile(repat.toString()); } }