/*******************************************************************************
 * Copyright (c) 2013 Arapiki Solutions Inc.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *    psmith - initial API and
 *        implementation and/or initial documentation
 *******************************************************************************/

package com.buildml.utils.regex;

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/**
 * A static utility class for handling regular expressions.
 *
 * @author Peter Smith <psmith@arapiki.com>
 */
public class BmlRegex {

    /*=====================================================================================*
     * PUBLIC STATIC METHODS
     *=====================================================================================*/

    /**
     * Given a regular expression in Ant syntax, convert it into a Java regular expression
     * and return it as a String. The returned expression is anchored at both ends, so it
     * must cover the whole candidate string.
     *
     * Only the characters listed in the conversion rules inside this method are translated
     * or quoted. Any other character, including other Java regex metacharacters such as
     * parentheses, braces, the vertical bar and the backslash, is copied to the output
     * unchanged.
     *
     * @param antRegex The regular expression in Ant syntax (using *, ?, **). Must not be
     *                 null. An empty string is converted to a pattern that only matches
     *                 the empty string.
     * @return The equivalent Java regular expression.
     */
    public static String convertAntToJavaRegex(String antRegex) {

        /*
         * The rules for conversion are:
         *   **  - replaced by ".*"    - matches multiple path components.
         *   *   - replaced by "[^/]*" - matches multiple characters, but not across
         *                               path components.
         *   ?   - replaced by "."     - matches a single character.
         *   .   - replaced by \.      - to stop it functioning as a wildcard.
         *   [, $, ^, &, + are quoted with \ to stop them being treated as special.
         */

        /* create an output buffer with (hopefully) enough space for the translated string */
        int len = antRegex.length();
        StringBuilder sb = new StringBuilder((int) (len * 1.5) + 2);

        /* pattern starts from beginning of string */
        sb.append('^');

        /* for each character in the input string, map from Ant syntax to Java syntax */
        for (int i = 0; i < len; i++) {
            char thisCh = antRegex.charAt(i);

            switch (thisCh) {
            case '*':
                if ((i + 1 < len) && (antRegex.charAt(i + 1) == '*')) {
                    sb.append(".*");
                    /* consume the second star too, so it isn't translated a second time */
                    i++;
                } else {
                    sb.append("[^/]*");
                }
                break;

            case '?':
                sb.append('.');
                break;

            case '.':
            case '[':
            case '$':
            case '^':
            case '&':
            case '+':
                sb.append('\\');
                sb.append(thisCh);
                break;

            default:
                sb.append(thisCh);
                break;
            }
        }

        /* a trailing slash implies "everything below this directory" (guard against empty input) */
        if ((len > 0) && (antRegex.charAt(len - 1) == '/')) {
            sb.append(".*");
        }

        /* pattern must cover the whole string */
        sb.append('$');
        return sb.toString();
    }

    /*-------------------------------------------------------------------------------------*/

    /**
     * Returns true or false to indicate whether a candidate string matches the provided
     * regular expression. The whole of the candidate string must be matched.
     *
     * @param stringToMatch The string to compare against the pattern.
     * @param regex The regular expression in Java regex syntax.
     * @return True if the string matches the pattern, else false.
     * @throws PatternSyntaxException If regex has an invalid format.
     */
    public static boolean matchRegex(String stringToMatch, String regex)
            throws PatternSyntaxException {
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(stringToMatch);
        return matcher.matches();
    }

    /*-------------------------------------------------------------------------------------*/

    /**
     * Returns true or false to indicate whether a candidate string matches the provided
     * regular expression (in Ant regex syntax).
     *
     * @param stringToMatch The string to compare against the pattern.
     * @param antRegex The regular expression in Ant syntax (using *, ?, **)
     * @return True if the string matches the pattern, else false.
     * @throws PatternSyntaxException If antRegex has an invalid format.
     */
    public static boolean matchAntRegex(String stringToMatch, String antRegex)
            throws PatternSyntaxException {
        /* translate to Java syntax, then share the matching logic with matchRegex() */
        return matchRegex(stringToMatch, convertAntToJavaRegex(antRegex));
    }

    /*-------------------------------------------------------------------------------------*/

    /**
     * Given an array of regex pattern strings, compile the strings into a single RegexChain
     * object. A chain is a sequence of regular expressions that can be matched against a
     * candidate string. When matching a string against a chain, we pay attention to whether
     * the chain members "include" or "exclude" the string. This will determine whether the
     * candidate string should match or not-match the whole RegexChain.
     *
     * Each entry in the chain must start with a prefix to state the type of the regex. Valid
     * prefixes are:
     *    "ia:" - An "inclusion" rule in Ant syntax.
     *    "ea:" - An "exclusion" rule in Ant syntax.
     *    "ij:" - An "inclusion" rule in Java syntax.
     *    "ej:" - An "exclusion" rule in Java syntax.
     *
     * Everything after the first colon is treated as the expression, so the expression
     * itself may contain further colons.
     *
     * @param regexChain An array of regular expressions, each string prefixed by one of the
     *                   above prefixes. If null, an empty chain is returned.
     * @return The RegexChain object.
     * @throws PatternSyntaxException If one of the input patterns is null, has a missing or
     *                   unrecognised prefix, or has an invalid format.
     */
    public static RegexChain compileRegexChain(String regexChain[])
            throws PatternSyntaxException {

        RegexChain chain = new RegexChain();
        if (regexChain == null) {
            return chain;
        }

        for (String regexString : regexChain) {
            if (regexString == null) {
                throw new PatternSyntaxException("Invalid regex string", regexString, 0);
            }

            /*
             * Split on the first colon only. Entries without any colon have no valid
             * prefix, and are reported in the same way as an unknown prefix.
             */
            int colonIndex = regexString.indexOf(':');
            if (colonIndex < 0) {
                throw new PatternSyntaxException("Invalid regex prefix", regexString, 0);
            }
            String prefix = regexString.substring(0, colonIndex);
            String expr = regexString.substring(colonIndex + 1);

            int regexType;
            String regexExpr;

            /* includes regex using Ant syntax */
            if ("ia".equals(prefix)) {
                regexType = RegexChain.TYPE_INCLUDES;
                regexExpr = convertAntToJavaRegex(expr);
            }

            /* excludes regex using Ant syntax */
            else if ("ea".equals(prefix)) {
                regexType = RegexChain.TYPE_EXCLUDES;
                regexExpr = convertAntToJavaRegex(expr);
            }

            /* includes regex using Java syntax */
            else if ("ij".equals(prefix)) {
                regexType = RegexChain.TYPE_INCLUDES;
                regexExpr = expr;
            }

            /* excludes regex using Java syntax */
            else if ("ej".equals(prefix)) {
                regexType = RegexChain.TYPE_EXCLUDES;
                regexExpr = expr;
            }

            /* other prefixes are invalid */
            else {
                throw new PatternSyntaxException("Invalid regex prefix", regexString, 0);
            }

            /* convert the regex to a Pattern, and add it to our chain */
            Pattern pattern = Pattern.compile(regexExpr);
            chain.addEntry(regexType, pattern);
        }

        return chain;
    }

    /*-------------------------------------------------------------------------------------*/

    /**
     * Match a string against a regular expression chain. Rules in the chain are visited in
     * the order they were added. If an "include" matches the string, the result
     * is true, unless there's an "exclude" that later invalidates the match. As soon as an
     * "exclude" matches, the result is false; no later pattern can include the string again.
     *
     * Initially the string will be in the excluded state so if it never matches any "include"
     * regexes, the result will be false.
     *
     * @param stringToMatch The string we're matching against the regex chain. A null string
     *                      never matches.
     * @param regexChain The chain of regular expressions to match against. A null chain
     *                      never matches.
     * @return True if the stringToMatch is included by the chain, else false.
     */
    public static boolean matchRegexChain(String stringToMatch, RegexChain regexChain) {

        /* missing inputs can't be matched (Pattern.matcher() would reject a null string) */
        if ((regexChain == null) || (stringToMatch == null)) {
            return false;
        }

        /* strings are excluded by default */
        boolean included = false;
        int len = regexChain.getSize();

        /* traverse the chain until the end, or until the string is excluded */
        for (int i = 0; i < len; i++) {
            Matcher m = regexChain.getPattern(i).matcher(stringToMatch);
            if (!m.matches()) {
                continue;
            }

            int regexType = regexChain.getType(i);

            /* if we've matched an "include", we still need to check for future excludes */
            if (regexType == RegexChain.TYPE_INCLUDES) {
                included = true;
            }

            /* we matched an "exclude", there's no way to re-include at this point, so end now */
            else if (regexType == RegexChain.TYPE_EXCLUDES) {
                return false;
            }
        }

        return included;
    }

    /*-------------------------------------------------------------------------------------*/

    /**
     * Match an array of strings against a regular expression chain and return a new array
     * containing only the strings that match, in their original order. Each string is
     * tested with {@link #matchRegexChain(String, RegexChain)}.
     *
     * @param stringsToMatch The array of strings we're matching against the regex chain.
     * @param regexChain The chain of regular expressions to match against.
     * @return The array of strings containing only those input strings that matched the
     *         regex chain, or null if either of the arguments is null.
     */
    public static String[] filterRegexChain(String stringsToMatch[], RegexChain regexChain) {

        if ((regexChain == null) || (stringsToMatch == null)) {
            return null;
        }

        ArrayList<String> result = new ArrayList<String>();
        for (String candidate : stringsToMatch) {
            if (matchRegexChain(candidate, regexChain)) {
                result.add(candidate);
            }
        }
        return result.toArray(new String[result.size()]);
    }

    /*-------------------------------------------------------------------------------------*/
}