package info.ephyra.util; import info.ephyra.nlp.NETagger; import info.ephyra.nlp.SnowballStemmer; import info.ephyra.nlp.indices.FunctionWords; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.HashSet; /** * A collection of utilities for string processing. * * @author Nico Schlaefer * @version 2007-05-05 */ // TODO use Levenstein distance to identify similar tokens public class StringUtils { /** * Fraction of words that must occur in both strings for * <code>equalsIntersect()</code> to be true. */ private static final float INTERSECT_THRESH = 0.33f; /** * Checks if the first array of tokens is a subset if the second array. * * @param tokens1 token array 1 * @param tokens2 token array 2 * * @return true, iff ss1 is a subset of ss2 */ private static boolean isSubset(String[] tokens1, String[] tokens2) { boolean exists; for (String token1 : tokens1) { exists = false; for (String token2 : tokens2) if (token1.equals(token2)) { exists = true; break; } if (!exists) return false; } return true; } /** * Checks if the tokens in the first string form a subset of the tokens in * the second string. * * @param s1 string 1 * @param s2 string 2 * @return true, iff the tokens in s1 are a subset of the tokens in s2 */ public static boolean isSubset(String s1, String s2) { if (s1 == null) return true; if (s2 == null) return false; String[] tokens1 = s1.split(" "); String[] tokens2 = s2.split(" "); return isSubset(tokens1, tokens2); } /** * Checks if the tokens in the first string form a subset of the tokens in * the second string. Function words and tokens of length less than 2 are * ignored. * * @param s1 string 1 * @param s2 string 2 * @return true, iff the keywords in s1 are a subset of the tokens in s2 */ public static boolean isSubsetKeywords(String s1, String s2) { if (s1 == null) return true; if (s2 == null) return false; String[] tokens1 = s1.split(" "); String[] tokens2 = s2.split(" "); // eliminate function words and tokens of length < 2 from tokens1 ArrayList<String> tks1 = new ArrayList<String>(); for (String token1 : tokens1) if (token1.length() > 1 && !FunctionWords.lookup(token1)) tks1.add(token1); tokens1 = tks1.toArray(new String[tks1.size()]); return isSubset(tokens1, tokens2); } /** * Concatenates an array of strings, using the given delimiter. * * @param ss array of strings * @param delim delimiter * @return concatenated string */ public static String concat(String[] ss, String delim) { String s = ""; if (ss.length > 0) s += ss[0]; for (int i = 1; i < ss.length; i++) s += delim + ss[i]; return s; } /** * Concatenates an array of strings, using whitespaces as delimiters. * * @param ss array of strings * @return concatenated string */ public static String concatWithSpaces(String[] ss) { String s = ""; if (ss.length > 0) s += ss[0]; for (int i = 1; i < ss.length; i++) s += " " + ss[i]; return s; } /** * Concatenates an array of strings, using tabs as delimiters. * * @param ss array of strings * @return concatenated string */ public static String concatWithTabs(String[] ss) { String s = ""; if (ss.length > 0) s += ss[0]; for (int i = 1; i < ss.length; i++) s += "\t" + ss[i]; return s; } /** * Repeats string <code>s</code> <code>n</code> times. * * @param s a string * @param n number of repetitions */ public static String repeat(String s, int n) { String repeated = ""; for (int i = 0; i < n; i++) repeated += s; return repeated; } /** * Normalizes a string. Similar strings are mapped to equal normalizations. * * @param s the string * @return normalized string */ // TODO use noun and verb stemming (also for equals...Norm() methods) public static String normalize(String s) { // convert to lower-case s = s.toLowerCase(); // tokenize String tokens[] = NETagger.tokenize(s); // stemm all tokens for (int i = 0; i < tokens.length; i++) tokens[i] = SnowballStemmer.stem(tokens[i]); return concatWithSpaces(tokens); } /** * Compares the normalizations of the two strings, using the standard * <code>String.equals()</code> method. * * @param s1 string 1 * @param s2 string 2 * @return true, iff the normalizations are equal */ public static boolean equalsNorm(String s1, String s2) { return normalize(s1).equals(normalize(s2)); } /** * Compares two strings. The strings are considered equal, iff one of the * strings is a subset of the other string, i.e. iff all the tokens in the * one string also occur in the other string. * * @param s1 string 1 * @param s2 string 2 * @return true, iff the strings are equal in the sense defined above */ public static boolean equalsSubset(String s1, String s2) { return isSubset(s1, s2) || isSubset(s2, s1); } /** * Compares the normalizations of the two strings, using the * <code>equalsSubset()</code> method. * * @param s1 string 1 * @param s2 string 2 * @return true, iff the normalizations are equal */ public static boolean equalsSubsetNorm(String s1, String s2) { return equalsSubset(normalize(s1), normalize(s2)); } /** * Compares two strings. The strings are considered equal, iff the number of * words that occur in both strings over the total number of words is at * least <code>INTERSECT_FRAC</code>. * * @param s1 string 1 * @param s2 string 2 * @return true, iff the strings are equal in the sense defined above */ public static boolean equalsIntersect(String s1, String s2) { // tokenize both strings String[] tokens1 = s1.split(" "); String[] tokens2 = s2.split(" "); // number of common tokens and total number of tokens // (note that duplicates are not handled properly) int commonTokens = 0; int totalTokens = tokens2.length; for (String token1 : tokens1) for (String token2 : tokens2) if (token1.equals(token2)) commonTokens++; else totalTokens++; return ((float) commonTokens) / totalTokens >= INTERSECT_THRESH; } /** * Compares the normalizations of the two strings, using the * <code>equalsIntersect()</code> method. * * @param s1 string 1 * @param s2 string 2 * @return true, iff the normalizations are equal */ public static boolean equalsIntersectNorm(String s1, String s2) { return equalsIntersect(normalize(s1), normalize(s2)); } /** * Compares two strings. The strings are considered equal, iff they have a * common token. Function words and tokens of length less than 2 are * ignored. * * @param s1 string 1 * @param s2 string 2 * @return true, iff the strings are equal in the sense defined above */ public static boolean equalsCommon(String s1, String s2) { // tokenize both strings String[] tokens1 = s1.split(" "); String[] tokens2 = s2.split(" "); // eliminate function words and tokens of length < 2 ArrayList<String> tks1 = new ArrayList<String>(); for (String token1 : tokens1) if (token1.length() > 1 && !FunctionWords.lookup(token1)) tks1.add(token1); HashSet<String> tks2 = new HashSet<String>(); for (String token2 : tokens2) if (token2.length() > 1 && !FunctionWords.lookup(token2)) tks2.add(token2); // check for common token for (String token : tks1) if (tks2.contains(token)) return true; return false; } /** * Compares the normalizations of the two strings, using the same criterion * as the <code>equalsCommon()</code> method. * * @param s1 string 1 * @param s2 string 2 * @return true, iff the normalizations are equal */ public static boolean equalsCommonNorm(String s1, String s2) { // convert to lower-case s1 = s1.toLowerCase(); s2 = s2.toLowerCase(); // tokenize String tokens1[] = NETagger.tokenize(s1); String tokens2[] = NETagger.tokenize(s2); // eliminate function words and tokens of length < 2, stemm all tokens ArrayList<String> tks1 = new ArrayList<String>(); for (String token1 : tokens1) if (token1.length() > 1 && !FunctionWords.lookup(token1)) tks1.add(SnowballStemmer.stem(token1)); HashSet<String> tks2 = new HashSet<String>(); for (String token2 : tokens2) if (token2.length() > 1 && !FunctionWords.lookup(token2)) tks2.add(SnowballStemmer.stem(token2)); // check for common token for (String token : tks1) if (tks2.contains(token)) return true; return false; } /** * Compares two strings, using the same criterion as the <code>equalsCommonNorm()</code> method, but considers only words starting with a capital letter (proper nouns) * * @param s1 string 1 * @param s2 string 2 * @return true, iff the proper nouns are equal */ public static boolean equalsCommonProp(String s1, String s2) { // convert to lower-case s1 = s1.toLowerCase(); s2 = s2.toLowerCase(); // tokenize String tokens1[] = NETagger.tokenize(s1); String tokens2[] = NETagger.tokenize(s2); // eliminate function words and tokens of length < 2, stemm all tokens ArrayList<String> tks1 = new ArrayList<String>(); for (String token1 : tokens1) if (token1.length() > 1 && !FunctionWords.lookup(token1) && token1.substring(0, 1).matches("[A-Z]")) tks1.add(SnowballStemmer.stem(token1)); HashSet<String> tks2 = new HashSet<String>(); for (String token2 : tokens2) if (token2.length() > 1 && !FunctionWords.lookup(token2) && token2.substring(0, 1).matches("[A-Z]")) tks2.add(SnowballStemmer.stem(token2)); // check for common token for (String token : tks1) if (tks2.contains(token)) return true; return false; } /** * Replaces all substrings of <code>s</code> that match <code>s1</code> with * <code>s2</code>. This method is similar to <code>String.replace()</code>, * but it ignores the case of <code>s1</code>. * * @param s the string * @param s1 the substring to be replaced * @param s2 the replacement for the substring * @return modified string */ public static String replaceIgnoreCase(String s, String s1, String s2) { return s.replaceAll("(?i)" + RegexConverter.strToRegex(s1), RegexConverter.strToRegex(s2)); } /** * <p>Sorts an array of strings by their length in ascending order.</p> * * <p>This sort is guaranteed to be stable: strings of equal length are not * reordered.</p> * * @param ss array of strings */ public static void sortByLength(String[] ss) { Comparator<String> lengthC = new Comparator<String>() { public int compare(String s1, String s2) { return s1.length() - s2.length(); } }; Arrays.sort(ss, lengthC); } /** * <p>Sorts an array of strings by their length in descending order.</p> * * <p>This sort is guaranteed to be stable: strings of equal length are not * reordered.</p> * * @param ss array of strings */ public static void sortByLengthDesc(String[] ss) { Comparator<String> lengthC = new Comparator<String>() { public int compare(String s1, String s2) { return s2.length() - s1.length(); } }; Arrays.sort(ss, lengthC); } }