/*
* This code has been adapted from the jakarta ORO package. The original license
* follows below:
*
* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2000 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
* must not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache"
* or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
* name, without prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*
* $Id$
*/
package org.exist.util;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Translates a glob expression into a Java regular expression.
*
* The following syntax is supported for glob expressions:
*
* <ul>
* <li> <b>*</b> - Matches zero or more instances of any character (it is
* translated to the regular expression <b>.*</b>).
* <li> <b>?</b> - Matches zero or one instance of any character (it is
* translated to the regular expression <b>.?</b>). The option masks of
* the original ORO implementation are not supported by this class.
* <li> <b>[...]</b> - Matches any of characters enclosed by the brackets.
* <b> * </b> and <b>?</b> lose their special meanings within a
* character class. Additionally, if the first character following
* the opening bracket is a <b>!</b> or a <b>^</b>, then any
* character not in the character class is matched. A <b>-</b>
* between two characters can be used to denote a range. A
* <b>-</b> at the beginning or end of the character class matches
* itself rather than referring to a range. A <b>]</b> immediately
* following the opening <b>[</b> matches itself rather than
* indicating the end of the character class, otherwise it must be
* escaped with a backslash to refer to itself.
* <li> <b>\</b> - A backslash matches itself in most situations. But
* when a special character such as a <b>*</b> follows it, a
* backslash <em> escapes </em> the character, indicating that
* the special character should be interpreted as a normal character
* instead of its special meaning.
* <li> All other characters match themselves.
* </ul>
*
* <p>Please remember that when you construct a Java string in Java code,
* the backslash character is itself a special Java character, and it must
* be double backslashed to represent single backslash in a regular
* expression.</p>
*
* <p>The original code is adapted from the jakarta ORO package.</p>
*
*/
public class GlobToRegex {

    /** Characters that carry a special meaning in a regular expression. */
    private static final String REGEX_META_CHARACTERS = "*?+[]()|^$.{}\\";

    /**
     * Tells whether the given character has a special meaning in a regular
     * expression and therefore has to be escaped to be matched literally.
     *
     * @param ch the character to test
     * @return true if {@code ch} is a regular expression meta character
     */
    private static boolean isRegexMetaCharacter(char ch) {
        return REGEX_META_CHARACTERS.indexOf(ch) >= 0;
    }

    /**
     * Tells whether the given character has a special meaning in a glob
     * expression, i.e. whether a preceding backslash escapes it.
     *
     * @param ch the character to test
     * @return true if {@code ch} is one of {@code * ? [ ]}
     */
    private static boolean isGlobMetaCharacter(char ch) {
        return ch == '*' || ch == '?' || ch == '[' || ch == ']';
    }

    /**
     * Converts a glob expression into the source text of an equivalent
     * {@link java.util.regex.Pattern} regular expression.
     *
     * <p>Outside a character class {@code *} becomes {@code .*} and
     * {@code ?} becomes {@code .?}; every other regular expression meta
     * character is escaped with a backslash. Inside a character class
     * {@code *} and {@code ?} are copied literally, a leading {@code !} or
     * {@code ^} negates the class and a {@code ]} directly after the opening
     * bracket is a member of the class. A backslash followed by one of
     * {@code * ? [ ]} escapes that character; any other backslash is
     * translated to a literal backslash.</p>
     *
     * @param pattern
     *            the glob expression to translate; must not be null
     * @return the regular expression source equivalent to the glob
     *         expression
     */
    public static String globToRegexp(CharSequence pattern) {
        final int length = pattern.length();
        final StringBuilder buffer = new StringBuilder(2 * length);
        boolean inCharSet = false;

        for (int i = 0; i < length; i++) {
            final char c = pattern.charAt(i);
            switch (c) {
                case '*':
                    buffer.append(inCharSet ? "*" : ".*");
                    break;
                case '?':
                    buffer.append(inCharSet ? "?" : ".?");
                    break;
                case '[':
                    if (inCharSet) {
                        // Java treats an unescaped '[' inside a class as the
                        // start of a nested class; in a glob it is a plain
                        // member of the class, so it has to be escaped.
                        buffer.append("\\[");
                        break;
                    }
                    inCharSet = true;
                    buffer.append('[');
                    if (i + 1 < length) {
                        final char next = pattern.charAt(i + 1);
                        if (next == '!' || next == '^') {
                            // both glob negation markers map to '^'
                            buffer.append('^');
                            i++;
                        } else if (next == ']') {
                            // ']' right after '[' is a literal class member
                            buffer.append(']');
                            i++;
                        }
                    }
                    break;
                case ']':
                    inCharSet = false;
                    buffer.append(']');
                    break;
                case '\\':
                    buffer.append('\\');
                    if (i + 1 < length && isGlobMetaCharacter(pattern.charAt(i + 1))) {
                        // escaped glob meta character: emit it escaped and skip it
                        buffer.append(pattern.charAt(++i));
                    } else {
                        // trailing or ordinary backslash matches itself
                        buffer.append('\\');
                    }
                    break;
                default:
                    if (inCharSet) {
                        // "&&" would be read by Java as a class intersection
                        if (c == '&') {
                            buffer.append('\\');
                        }
                    } else if (isRegexMetaCharacter(c)) {
                        buffer.append('\\');
                    }
                    buffer.append(c);
                    break;
            }
        }
        return buffer.toString();
    }

    /**
     * Small demonstration: translates a sample glob, searches a sample text
     * with it and prints the matched part.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final String glob = "[gG]enerate? ";
        final String text = "This is generated text";
        final Matcher matcher = Pattern.compile(globToRegexp(glob)).matcher(text);
        // start()/end()/group() throw IllegalStateException without a successful find()
        if (matcher.find()) {
            System.out.println(matcher.group());
        } else {
            System.out.println("No match for glob '" + glob + "' in: " + text);
        }
    }
}