package info.ephyra.util;
/**
* The <code>RegexConverter</code> can be used to transform a string into a
* regular expression and to build a query string from a regular expression.
*
* @author Nico Schlaefer
* @version 2006-04-12
*/
public class RegexConverter {
    /** Characters that are escaped by <code>strToRegex()</code>. */
    private static final String METACHARACTERS = "\\|*+?.^$(){}[]";

    /**
     * Metacharacters whose escape sequence "\C" is reduced to "C" when a
     * query string is built. The characters '(', ')' and '?' are not in this
     * list, so "\(", "\)" and "\?" are left escaped.
     */
    private static final String UNESCAPED_IN_QUERY = "{}[]^$.|*+";

    /**
     * Matches a maximal run of characters other than '(', ')', '|' and
     * whitespace that contains at least one '.' or '\'. Compiled once
     * because the pattern is constant.
     */
    private static final java.util.regex.Pattern REGEX_TOKEN =
            java.util.regex.Pattern.compile(
                    "[^\\(\\)\\s\\|]*[\\.\\\\][^\\(\\)\\s\\|]*");

    /**
     * Transforms a string into a regular expression that matches the string.
     * This is done by replacing all metacharacters 'C' by '\C'.
     *
     * @param expr a string
     * @return a regular expression that matches the string
     */
    public static String strToRegex(String expr) {
        // A single pass is equivalent to escaping '\' first and the other
        // metacharacters afterwards: a backslash that was inserted as an
        // escape is never escaped again.
        StringBuilder escaped = new StringBuilder(expr.length() + 8);
        for (int i = 0; i < expr.length(); i++) {
            char c = expr.charAt(i);
            if (METACHARACTERS.indexOf(c) >= 0) {
                escaped.append('\\');
            }
            escaped.append(c);
        }
        return escaped.toString();
    }

    /**
     * Transforms a string into a regular expression that describes a regular
     * expression that matches the string by applying the method
     * <code>strToRegex()</code> twice.
     *
     * @param expr a string
     * @return a regular expression that describes a regular expression that
     *         matches the string
     */
    public static String strToRegex2(String expr) {
        return strToRegex(strToRegex(expr));
    }

    /**
     * Transforms a string into a regular expression and adds word boundaries
     * if the first/last character is a word character, i.e. an ASCII letter,
     * an ASCII digit or an underscore.
     *
     * @param expr a string
     * @return a regular expression with word boundaries, or the unchanged
     *         string if it is empty
     */
    public static String strToRegexWithBounds(String expr) {
        if (expr.length() == 0) {
            return expr;
        }

        String escaped = strToRegex(expr);
        StringBuilder bounded = new StringBuilder(escaped.length() + 4);
        // An escaped metacharacter starts with '\', which is not a word
        // character, so testing the escaped string is sufficient.
        if (isWordChar(escaped.charAt(0))) {
            bounded.append("\\b");
        }
        bounded.append(escaped);
        if (isWordChar(escaped.charAt(escaped.length() - 1))) {
            bounded.append("\\b");
        }
        return bounded.toString();
    }

    /**
     * Returns a query string that is derived from the regular expression. When
     * learning patterns, this string should be part of the query string to
     * ensure that both the TARGET and the sought PROPERTY occur in the passages
     * that are obtained from the search engine.
     * <p>
     * The conversion is a heuristic that is applied in the following order:
     * <ol>
     * <li>the '\' before the metacharacters
     *     <code>{ } [ ] ^ $ . | * +</code> is dropped</li>
     * <li>the sequence "\s+" is replaced by a single blank</li>
     * <li>every run of characters other than '(', ')', '|' and whitespace
     *     that contains a '.' or a '\' is dropped; this includes dots that
     *     were unescaped in the first step</li>
     * <li>"(?:" is replaced by "("</li>
     * <li>")?" is replaced by "|)"</li>
     * </ol>
     *
     * @param regex a regular expression
     * @return a query string derived from the regular expression
     */
    // TODO improve this
    public static String regexToQueryStr(String regex) {
        // drop '\' before metacharacters; "\(", "\)" and "\?" stay escaped
        for (int i = 0; i < UNESCAPED_IN_QUERY.length(); i++) {
            String meta = String.valueOf(UNESCAPED_IN_QUERY.charAt(i));
            regex = regex.replace("\\" + meta, meta);
        }
        // replace "\s+" by single blank
        regex = regex.replace("\\s+", " ");
        // drop substrings that contain '.' or '\' (the two characters form a
        // character class; as a plain sequence only ".\" would be matched)
        regex = REGEX_TOKEN.matcher(regex).replaceAll("");
        // replace "(?:" by simple '('
        regex = regex.replace("(?:", "(");
        // replace ")?" by "|)"
        regex = regex.replace(")?", "|)");
        return regex;
    }

    /** Checks if the character is matched by "\w" in its default ASCII form. */
    private static boolean isWordChar(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9') || c == '_';
    }
}