/*******************************************************************************
 * ALMA - Atacama Large Millimeter Array
 * Copyright (c) ESO - European Southern Observatory, 2011
 * (in the framework of the ALMA collaboration).
 * All rights reserved.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *******************************************************************************/
/*
 * @@COPYRIGHT@@
 */
package com.cosylab.util;

/**
 * A Unix-like wildchar matcher. Supported wild-characters: '*', '?'; sets:
 * [a-z], '!' negation.
 * <p>
 * Examples: '[a-g]l*i?n' matches 'florian', '[!abc]*e' matches 'smile',
 * '[-z]' matches 'a'.
 * <p>
 * Rules for sets. The RegEx definition of a valid set is
 * [!]?(-.)?((.-.)|(.))*(.-)?
 * <pre>
 *   a-z : match any letter between 'a' and 'z' inclusively
 *   [-a : match everything up to and including 'a' (only valid at beginning)
 *   a-] : match everything from 'a' (only valid at the end)
 *   a   : match exactly 'a'
 *   !a  : not operator, match everything except 'a' (only allowed at beginning)
 *   \a  : treat a literally (useful for specifying '!]-' in sets).
 *         Note that \t\b\n... are not processed.
 * </pre>
 * Wildchar rules:
 * <pre>
 *   *     : match any number (0..inf) of occurrences of any character
 *   ?     : match exactly and only one occurrence of any character
 *   ab    : match exactly 'ab'
 *   [..]* : same as *, but every character must match the set.
 * </pre>
/*
 *
 * @author <a href="mailto:ales.pucelj@cosylab.com">Ales Pucelj</a>
 * @version $id$
 */
public class WildcharMatcher {
    /** When true, the matcher traces its progress on standard output. */
    private static final boolean DEBUG = false;

    /** Value of initial state */
    private static final int INITIAL = 0;

    /** Value of final state */
    private static final int FINAL = 2;

    /** Value of error state */
    private static final int ERROR = 99;

    /** Any character (except control, unless escaped) */
    private static final int TOKEN_CHAR = 0;

    /** Token for end of set: ] */
    private static final int TOKEN_END = 1;

    /** Token for negation: ! */
    private static final int TOKEN_NOT = 2;

    /** Token for range specification: - */
    private static final int TOKEN_MINUS = 3;

    /**
     * Transition table holding the next state used in set parsing. Rows are
     * states, columns are tokens: TRANSITIONS[1][3] == 5 means "in state 1,
     * on token 3 (TOKEN_MINUS), go to state 5".
     * <p>
     * States: 0 = start of set, 1 = after a literal character, 2 = FINAL,
     * 3 = after the leading '!', 4 = after a leading '-', 5 = after
     * "literal-", 6 = after a completed range.
     */
    private static final int[][] TRANSITIONS = {
        { 1, FINAL, 3, 4 },
        { 1, FINAL, ERROR, 5 },
        { ERROR, ERROR, ERROR, ERROR },
        { 1, FINAL, ERROR, 4 },
        { 6, ERROR, ERROR, ERROR },
        { 6, FINAL, ERROR, ERROR },
        { 1, FINAL, ERROR, ERROR }
    };

    /**
     * Classifies an unescaped set character as one of the TOKEN_ constants.
     *
     * @param ch character read from the set
     *
     * @return TOKEN_END for ']', TOKEN_NOT for '!', TOKEN_MINUS for '-',
     *         TOKEN_CHAR for anything else
     */
    private static int getToken(final char ch) {
        switch (ch) {
            case ']':
                return TOKEN_END;
            case '!':
                return TOKEN_NOT;
            case '-':
                return TOKEN_MINUS;
            default:
                return TOKEN_CHAR;
        }
    }

    /**
     * Tests a single character against a set, driving the DFA stored in
     * {@link #TRANSITIONS}. With CHAR being anything except ']', '!' and '-'
     * (or any backslash-escaped character), the accepted grammar is
     * <code>[!]?(-CHAR)?((CHAR-CHAR)|(CHAR))*(CHAR-)?\]</code>.
     * <p>
     * Scanning stops at the first element that contains <code>ch</code>, so
     * the remainder of the set is not validated in that case.
     *
     * @param pattern pattern containing the set
     * @param offset index of the first character after the opening '['
     * @param ch character to test
     *
     * @return true if <code>ch</code> belongs to the set (or, for a set
     *         starting with '!', does not belong to it); false if it does
     *         not, or if the set is malformed or not terminated by ']'
     *
     * @throws NullPointerException if pattern is null
     * @throws StringIndexOutOfBoundsException if offset is negative
     */
    public static boolean testSet(final String pattern, int offset, final char ch) {
        final int n = pattern.length();

        int state = INITIAL;
        // Most recent literal; becomes the lower bound if a '-' follows it.
        char rangeStart = ' ';
        boolean found = false;
        boolean negate = false;

        while (!found) {
            // Running off the pattern means the set was never closed. Without
            // this check the previous token would be replayed forever.
            if (offset >= n) {
                return false;
            }

            char nextChar = pattern.charAt(offset);
            final int nextToken;

            if (nextChar == '\\') {
                // An escape always consumes two characters; a trailing
                // backslash is an invalid set.
                offset++;
                if (offset >= n) {
                    return false;
                }
                nextChar = pattern.charAt(offset);
                nextToken = TOKEN_CHAR;
            } else {
                nextToken = getToken(nextChar);
            }

            switch (state) {
                case INITIAL:
                    if (nextToken == TOKEN_NOT) {
                        negate = true;
                        break;
                    }
                    // No break, states 0, 1, 3, 6 treat a literal the same way.
                case 1:
                case 3:
                case 6:
                    if (nextToken == TOKEN_CHAR) {
                        found = (ch == nextChar);
                        rangeStart = nextChar;
                    }
                    break;
                case 4:
                    // condition [-a...
                    found = (ch <= nextChar);
                    break;
                case 5:
                    if (nextToken == TOKEN_CHAR) {
                        // condition ...a-z...
                        found = (ch >= rangeStart) && (ch <= nextChar);
                    } else if (nextToken == TOKEN_END) {
                        // condition ...a-]
                        found = (ch >= rangeStart);
                    }
                    break;
                default:
                    break;
            }

            if (DEBUG) {
                System.out.println("( " + state + " -> "
                    + TRANSITIONS[state][nextToken] + " ) token = " + nextToken
                    + " char = " + nextChar + ", found = " + found
                    + ", negate = " + negate);
            }

            // Look up the next state; an invalid set never matches.
            state = TRANSITIONS[state][nextToken];

            if (state == ERROR) {
                return false;
            }

            if (state == FINAL) {
                // Closing ']' reached: the verdict is whatever was found,
                // inverted for negated sets.
                return found ^ negate;
            }

            offset++;
        }

        return found ^ negate;
    }

    /**
     * Finds the ']' that closes a set, honouring backslash escapes so that an
     * escaped ']' inside the set is not mistaken for its end.
     *
     * @param pattern pattern containing the set
     * @param from index of the first character after the opening '['
     *
     * @return index of the closing ']', or -1 if the set is not terminated
     */
    private static int findSetEnd(final String pattern, final int from) {
        final int n = pattern.length();

        for (int i = from; i < n; i++) {
            final char c = pattern.charAt(i);

            if (c == '\\') {
                // Skip the escaped character, whatever it is.
                i++;
            } else if (c == ']') {
                return i;
            }
        }

        return -1;
    }

    /**
     * Prints the unparsed remainders of both strings when DEBUG is enabled.
     */
    private static void trace(final String prefix, final String pattern,
        final int ip, final String str, final int is) {
        if ((ip > -1) && (is > -1) && (ip < pattern.length())
            && (is < str.length())) {
            System.out.println(prefix + pattern.substring(ip) + " "
                + str.substring(is));
        }
    }

    /**
     * Recursive method for parsing the string. To avoid copying the strings,
     * the method accepts offset indices into both parameters.
     *
     * @param pattern Pattern used in parsing
     * @param ofp Offset into pattern string (0 &lt;= ofp)
     * @param str String to test
     * @param ofs Offset into test string (0 &lt;= ofs)
     *
     * @return boolean true if the pattern from <code>ofp</code> matches the
     *         whole of the string from <code>ofs</code>
     *
     * @throws NullPointerException if pattern or str is null
     */
    public static boolean parse(final String pattern, final int ofp,
        final String str, final int ofs) {
        final int lp = pattern.length();
        final int ls = str.length();

        // index into pattern string
        int ip = ofp;
        // index into test string
        int is = ofs;

        if (DEBUG) {
            trace("parse: ", pattern, ip, str, is);
        }

        // Match happens only if we parse both strings exactly to the end
        while (ip < lp) {
            final char chp = pattern.charAt(ip);

            if (DEBUG) {
                trace("", pattern, ip, str, is);
            }

            switch (chp) {
                case '[': {
                    // Each set must be closed with a ], otherwise it is invalid.
                    final int end = findSetEnd(pattern, ip + 1);

                    if (end == -1) {
                        return false;
                    }

                    // Is this set followed by a *
                    final boolean isWildchar = ((end + 1) < lp)
                        && (pattern.charAt(end + 1) == '*');

                    if (is >= ls) {
                        // Nothing left to test: only "[..]*" can match the
                        // empty remainder (zero occurrences).
                        if (!isWildchar) {
                            return false;
                        }
                        ip = end + 2;
                        break;
                    }

                    // Does this character match
                    final boolean thisChar = testSet(pattern, ip + 1,
                            str.charAt(is));

                    if (isWildchar) {
                        if (!thisChar) {
                            // The set matches zero characters here.
                            ip = end + 2;
                            break;
                        }

                        // Either the set stops before this character
                        // (a[a-z]*z == az) or it consumes it and is tried
                        // again on the next one. These two cases are
                        // exhaustive, so their failure is final.
                        return parse(pattern, end + 2, str, is)
                            || parse(pattern, ip, str, is + 1);
                    }

                    if (!thisChar) {
                        return false;
                    }

                    // Single character matched, set was processed.
                    ip = end + 1;
                    is++;
                    break;
                }

                case '?':
                    // Needs exactly one character to consume.
                    if (is >= ls) {
                        return false;
                    }
                    ip++;
                    is++;
                    break;

                case '*':
                    // A run of asterisks is equivalent to a single one.
                    while ((ip < lp) && (pattern.charAt(ip) == '*')) {
                        ip++;
                    }

                    // Trailing asterisk matches everything that is left.
                    if (ip == lp) {
                        return true;
                    }

                    // Brute force: let the asterisk swallow 0..all remaining
                    // characters and try the rest of the pattern each time.
                    // The upper bound is inclusive because the rest of the
                    // pattern may itself match the empty string ("*[a]*").
                    for (int i = is; i <= ls; i++) {
                        if (parse(pattern, ip, str, i)) {
                            return true;
                        }
                    }

                    return false;

                default:
                    // Literal match
                    if ((is >= ls) || (chp != str.charAt(is))) {
                        return false;
                    }
                    ip++;
                    is++;
                    break;
            }
        }

        // Both the pattern and the test string must be used up exactly.
        return (is == ls) && (ip == lp);
    }

    /**
     * Tests whether a whole string matches a wildchar pattern.
     *
     * @param pattern pattern built from literals, '?', '*' and sets
     * @param str string to test
     *
     * @return true if <code>str</code> matches <code>pattern</code> entirely
     *
     * @throws NullPointerException if pattern or str is null
     */
    public static boolean match(final String pattern, final String str) {
        return parse(pattern, 0, str, 0);
    }

    /**
     * Prints a handful of sample matches, each labelled with the expected
     * result followed by the actual one.
     *
     * @param args command line parameters (unused)
     */
    public static void main(String[] args) {
        System.out.println("[-aa-]* == 01 abAZ : true = "
            + WildcharMatcher.match("[-aa-]*", "01 abAZ"));
        System.out.println("[\\!a\\-bc]* == !!!b-bb- : true = "
            + WildcharMatcher.match("[\\!a\\-bc]*", "!!!b-bb-"));
        System.out.println("*zz == zz : true = "
            + WildcharMatcher.match("*zz", "zz"));
        System.out.println("[abc]*zz == zz : true = "
            + WildcharMatcher.match("[abc]*zz", "zz"));
        System.out.println("[!abc]*a[def] == xyzbd : false = "
            + WildcharMatcher.match("[!abc]*a[def]", "xyzbd"));
        System.out.println("[!abc]*a[def] == xyzad : true = "
            + WildcharMatcher.match("[!abc]*a[def]", "xyzad"));
        System.out.println("[a-g]l*i?n == florian : true = "
            + WildcharMatcher.match("[a-g]l*i?n", "florian"));
        System.out.println("[!abc]*e == smile : true = "
            + WildcharMatcher.match("[!abc]*e", "smile"));
        System.out.println("[-z] == a : true = "
            + WildcharMatcher.match("[-z]", "a"));
        System.out.println("[] == '' : false = "
            + WildcharMatcher.match("[]", ""));
        System.out.println("[a-z]* == java : true = "
            + WildcharMatcher.match("[a-z]*", "java"));
        System.out.println("*.* == command.com : true = "
            + WildcharMatcher.match("*.*", "command.com"));
        System.out.println("*.* == /var/etc : false = "
            + WildcharMatcher.match("*.*", "/var/etc"));
        System.out.println("**?*x*[abh-]*Q == XYZxabbauuZQ : true = "
            + WildcharMatcher.match("**?*x*[abh-]*Q", "XYZxabbauuZQ"));
    }

    /**
     * Translate a simple wildcard string into a regular expression.
     * <P>
     * A simple wildcard is one that uses only '*' and '?' as control
     * characters.
     * <P>
     * Characters that are control characters for regular expressions are
     * escaped (for example a '[' in the wildcard becomes "\[" in the
     * translated string).
     *
     * @param wildcard The "simple" wildcard string to translate into a regular expression
     * @return The regular expression produced from the "simple" wildcard,
     *         anchored with '^' and '$'
     *
     * @throws NullPointerException if wildcard is null
     */
    public static String simpleWildcardToRegex(String wildcard) {
        final int length = wildcard.length();
        StringBuffer s = new StringBuffer(length + 2);
        s.append('^');

        for (int i = 0; i < length; i++) {
            char c = wildcard.charAt(i);

            switch (c) {
                case '*':
                    s.append(".*");
                    break;
                case '?':
                    s.append(".");
                    break;
                // escape special regexp-characters ('+' included, otherwise
                // it would act as a quantifier on the preceding character)
                case '(':
                case ')':
                case '[':
                case ']':
                case '$':
                case '^':
                case '.':
                case '{':
                case '}':
                case '|':
                case '+':
                case '\\':
                    s.append("\\");
                    s.append(c);
                    break;
                default:
                    s.append(c);
                    break;
            }
        }

        s.append('$');
        return s.toString();
    }
}

/* __oOo__ */