package info.ephyra.util;
import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.SnowballStemmer;
import info.ephyra.nlp.indices.FunctionWords;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Locale;
/**
* A collection of utilities for string processing.
*
* @author Nico Schlaefer
* @version 2007-05-05
*/
// TODO use Levenshtein distance to identify similar tokens
public class StringUtils {
/**
 * Minimum ratio of common tokens to total tokens that two strings must
 * reach for <code>equalsIntersect()</code> to consider them equal. The
 * ratio is compared against this value with <code>&gt;=</code>.
 */
private static final float INTERSECT_THRESH = 0.33f;
/**
 * Checks if the first array of tokens is a subset of the second array.
 * Duplicates are irrelevant: a token only has to occur at least once in
 * <code>tokens2</code>.
 *
 * @param tokens1 token array 1
 * @param tokens2 token array 2
 * @return true, iff every token in tokens1 also occurs in tokens2 (an
 *         empty tokens1 is a subset of anything)
 */
private static boolean isSubset(String[] tokens1, String[] tokens2) {
    // hash tokens2 once so that each membership test is a lookup instead of
    // a linear scan over tokens2
    HashSet<String> available = new HashSet<String>(Arrays.asList(tokens2));
    for (String token : tokens1) {
        if (!available.contains(token)) return false;
    }
    return true;
}
/**
 * Checks if the tokens in the first string form a subset of the tokens in
 * the second string. Both strings are tokenized with
 * <code>split(" ")</code>, i.e. tokens are separated by single blanks; an
 * empty string therefore yields one empty token, and consecutive blanks
 * yield empty tokens that must also occur in <code>s2</code>.
 *
 * @param s1 string 1, <code>null</code> is treated as a subset of anything
 * @param s2 string 2, <code>null</code> has no subsets except
 *           <code>null</code>
 * @return true, iff the tokens in s1 are a subset of the tokens in s2
 */
public static boolean isSubset(String s1, String s2) {
    if (s1 == null) return true;
    if (s2 == null) return false;
    return isSubset(s1.split(" "), s2.split(" "));
}
/**
 * Checks if the keywords of the first string all occur as tokens in the
 * second string. The keywords of <code>s1</code> are its blank-separated
 * tokens that are at least 2 characters long and for which
 * <code>FunctionWords.lookup()</code> returns false. The tokens of
 * <code>s2</code> are not filtered.
 *
 * @param s1 string 1, <code>null</code> yields true
 * @param s2 string 2, <code>null</code> yields false (unless s1 is
 *           <code>null</code>)
 * @return true, iff the keywords in s1 are a subset of the tokens in s2
 */
public static boolean isSubsetKeywords(String s1, String s2) {
    if (s1 == null) return true;
    if (s2 == null) return false;

    String[] candidates = s1.split(" ");
    String[] haystack = s2.split(" ");

    // keep only the keywords of s1
    ArrayList<String> keywords = new ArrayList<String>();
    for (String candidate : candidates) {
        boolean tooShort = candidate.length() <= 1;
        if (tooShort || FunctionWords.lookup(candidate)) continue;
        keywords.add(candidate);
    }

    String[] needles = keywords.toArray(new String[keywords.size()]);
    return isSubset(needles, haystack);
}
/**
 * Concatenates an array of strings, using the given delimiter. The
 * delimiter is inserted between consecutive elements only, never before
 * the first or after the last element. <code>null</code> elements and a
 * <code>null</code> delimiter are rendered as the text "null".
 *
 * @param ss array of strings
 * @param delim delimiter
 * @return concatenated string, empty if the array is empty
 */
public static String concat(String[] ss, String delim) {
    // a builder avoids copying the partial result on every iteration
    StringBuilder joined = new StringBuilder();
    for (int i = 0; i < ss.length; i++) {
        if (i > 0) joined.append(delim);
        joined.append(ss[i]);
    }
    return joined.toString();
}
/**
 * Concatenates an array of strings, using single blanks as delimiters.
 * Delegates to <code>concat()</code> so that all concatenation variants
 * share one implementation.
 *
 * @param ss array of strings
 * @return concatenated string, empty if the array is empty
 */
public static String concatWithSpaces(String[] ss) {
    return concat(ss, " ");
}
/**
 * Concatenates an array of strings, using tabs as delimiters. Delegates
 * to <code>concat()</code> so that all concatenation variants share one
 * implementation.
 *
 * @param ss array of strings
 * @return concatenated string, empty if the array is empty
 */
public static String concatWithTabs(String[] ss) {
    return concat(ss, "\t");
}
/**
 * Repeats string <code>s</code> <code>n</code> times.
 *
 * @param s a string
 * @param n number of repetitions
 * @return <code>s</code> concatenated <code>n</code> times, the empty
 *         string if <code>n</code> is zero or negative
 */
public static String repeat(String s, int n) {
    // a builder avoids copying the partial result on every iteration
    StringBuilder repeated = new StringBuilder();
    for (int i = 0; i < n; i++) repeated.append(s);
    return repeated.toString();
}
/**
 * Normalizes a string. Similar strings are mapped to equal normalizations.
 * The string is converted to lower-case, tokenized with
 * <code>NETagger.tokenize()</code>, each token is stemmed with
 * <code>SnowballStemmer.stem()</code> and the stems are joined with
 * single blanks.
 *
 * @param s the string, must not be <code>null</code>
 * @return normalized string
 */
// TODO use noun and verb stemming (also for equals...Norm() methods)
public static String normalize(String s) {
    // lower-case independently of the JVM's default locale (under e.g. a
    // Turkish locale "I" would otherwise become a dotless i)
    String lowerCased = s.toLowerCase(Locale.ROOT);
    // tokenize
    String[] tokens = NETagger.tokenize(lowerCased);
    // stem all tokens in place
    for (int i = 0; i < tokens.length; i++)
        tokens[i] = SnowballStemmer.stem(tokens[i]);
    return concatWithSpaces(tokens);
}
/**
 * Checks whether two strings have the same normalization, i.e. whether
 * <code>normalize()</code> maps both of them to strings that are equal
 * according to <code>String.equals()</code>.
 *
 * @param s1 string 1
 * @param s2 string 2
 * @return true, iff the normalizations are equal
 */
public static boolean equalsNorm(String s1, String s2) {
    String norm1 = normalize(s1);
    String norm2 = normalize(s2);
    return norm1.equals(norm2);
}
/**
 * Compares two strings by token containment. The strings are considered
 * equal, iff all the tokens of one of them also occur in the other one,
 * in either direction.
 *
 * @param s1 string 1
 * @param s2 string 2
 * @return true, iff s1 is a subset of s2 or s2 is a subset of s1
 */
public static boolean equalsSubset(String s1, String s2) {
    // try s1 within s2 first, only then the reverse direction
    if (isSubset(s1, s2)) return true;
    return isSubset(s2, s1);
}
/**
 * Applies <code>equalsSubset()</code> to the normalizations of the two
 * strings, as produced by <code>normalize()</code>.
 *
 * @param s1 string 1
 * @param s2 string 2
 * @return true, iff the normalizations are equal in the sense of
 *         <code>equalsSubset()</code>
 */
public static boolean equalsSubsetNorm(String s1, String s2) {
    String norm1 = normalize(s1);
    String norm2 = normalize(s2);
    return equalsSubset(norm1, norm2);
}
/**
 * Compares two strings. The strings are considered equal, iff the number
 * of distinct tokens that occur in both strings over the number of
 * distinct tokens that occur in at least one of the strings is at least
 * <code>INTERSECT_THRESH</code>. Tokens are separated by single blanks.
 *
 * <p>Each token is counted once, no matter how often it is repeated, so
 * identical strings always have a ratio of 1.</p>
 *
 * @param s1 string 1, must not be <code>null</code>
 * @param s2 string 2, must not be <code>null</code>
 * @return true, iff the strings are equal in the sense defined above;
 *         false if neither string contains a token
 */
public static boolean equalsIntersect(String s1, String s2) {
    // tokenize both strings, collapsing duplicate tokens
    HashSet<String> distinct1 =
        new HashSet<String>(Arrays.asList(s1.split(" ")));
    HashSet<String> distinct2 =
        new HashSet<String>(Arrays.asList(s2.split(" ")));

    // size of the intersection
    int commonTokens = 0;
    for (String token : distinct1)
        if (distinct2.contains(token)) commonTokens++;

    // size of the union: common tokens must not be counted twice
    int totalTokens = distinct1.size() + distinct2.size() - commonTokens;

    // split() can return an empty array (e.g. for " "); avoid 0 / 0
    if (totalTokens == 0) return false;

    return ((float) commonTokens) / totalTokens >= INTERSECT_THRESH;
}
/**
 * Applies <code>equalsIntersect()</code> to the normalizations of the two
 * strings, as produced by <code>normalize()</code>.
 *
 * @param s1 string 1
 * @param s2 string 2
 * @return true, iff the normalizations are equal in the sense of
 *         <code>equalsIntersect()</code>
 */
public static boolean equalsIntersectNorm(String s1, String s2) {
    String norm1 = normalize(s1);
    String norm2 = normalize(s2);
    return equalsIntersect(norm1, norm2);
}
/**
 * Compares two strings. The strings are considered equal, iff they share
 * at least one keyword. Keywords are the blank-separated tokens that are
 * at least 2 characters long and for which
 * <code>FunctionWords.lookup()</code> returns false.
 *
 * @param s1 string 1
 * @param s2 string 2
 * @return true, iff the strings have a keyword in common
 */
public static boolean equalsCommon(String s1, String s2) {
    String[] words1 = s1.split(" ");
    String[] words2 = s2.split(" ");

    // keywords of s1
    ArrayList<String> shared = new ArrayList<String>();
    for (String word : words1) {
        if (word.length() <= 1) continue;
        if (FunctionWords.lookup(word)) continue;
        shared.add(word);
    }

    // keywords of s2
    HashSet<String> keywords2 = new HashSet<String>();
    for (String word : words2) {
        if (word.length() <= 1) continue;
        if (FunctionWords.lookup(word)) continue;
        keywords2.add(word);
    }

    // whatever survives the intersection is a common keyword
    shared.retainAll(keywords2);
    return !shared.isEmpty();
}
/**
 * Compares the normalizations of the two strings, using the same criterion
 * as the <code>equalsCommon()</code> method: the strings are considered
 * equal, iff they share at least one stemmed keyword. Both strings are
 * converted to lower-case and tokenized with
 * <code>NETagger.tokenize()</code>; tokens shorter than 2 characters and
 * tokens for which <code>FunctionWords.lookup()</code> returns true are
 * dropped, the remaining tokens are stemmed with
 * <code>SnowballStemmer.stem()</code>.
 *
 * @param s1 string 1, must not be <code>null</code>
 * @param s2 string 2, must not be <code>null</code>
 * @return true, iff the normalizations have a token in common
 */
public static boolean equalsCommonNorm(String s1, String s2) {
    // convert to lower-case, independently of the JVM's default locale
    // (under e.g. a Turkish locale "I" would otherwise become a dotless i)
    s1 = s1.toLowerCase(Locale.ROOT);
    s2 = s2.toLowerCase(Locale.ROOT);

    // tokenize
    String[] tokens1 = NETagger.tokenize(s1);
    String[] tokens2 = NETagger.tokenize(s2);

    // eliminate function words and tokens of length < 2, stem all tokens
    ArrayList<String> tks1 = new ArrayList<String>();
    for (String token1 : tokens1)
        if (token1.length() > 1 && !FunctionWords.lookup(token1))
            tks1.add(SnowballStemmer.stem(token1));
    HashSet<String> tks2 = new HashSet<String>();
    for (String token2 : tokens2)
        if (token2.length() > 1 && !FunctionWords.lookup(token2))
            tks2.add(SnowballStemmer.stem(token2));

    // check for common token
    for (String token : tks1) if (tks2.contains(token)) return true;
    return false;
}
/**
 * Compares two strings, using the same criterion as the
 * <code>equalsCommonNorm()</code> method, but considers only words
 * starting with a capital letter (proper nouns). The capitalization is
 * checked on the original strings; the surviving tokens are then
 * lower-cased and stemmed before they are compared.
 *
 * @param s1 string 1, must not be <code>null</code>
 * @param s2 string 2, must not be <code>null</code>
 * @return true, iff the strings have a stemmed proper noun in common
 */
public static boolean equalsCommonProp(String s1, String s2) {
    HashSet<String> stems1 = properNounStems(s1);
    HashSet<String> stems2 = properNounStems(s2);

    // check for common token
    for (String stem : stems1) if (stems2.contains(stem)) return true;
    return false;
}
/**
 * Collects the lower-cased stems of all capitalized keywords in a string.
 *
 * @param s the string
 * @return stems of the tokens that are at least 2 characters long, start
 *         with an upper-case letter and are not function words
 */
private static HashSet<String> properNounStems(String s) {
    HashSet<String> stems = new HashSet<String>();
    // The capital-letter test must see the original case: lower-casing the
    // whole string first would make every token fail it.
    // NOTE(review): assumes NETagger.tokenize() splits mixed-case text the
    // same way as lower-case text - confirm.
    for (String token : NETagger.tokenize(s)) {
        if (token.length() < 2) continue;
        if (!Character.isUpperCase(token.charAt(0))) continue;
        // function word lookup and stemming operate on lower-case tokens,
        // as in equalsCommonNorm()
        String lowerCased = token.toLowerCase(Locale.ROOT);
        if (!FunctionWords.lookup(lowerCased))
            stems.add(SnowballStemmer.stem(lowerCased));
    }
    return stems;
}
/**
 * Replaces all substrings of <code>s</code> that match <code>s1</code> with
 * <code>s2</code>, ignoring the case of <code>s1</code>. Both
 * <code>s1</code> and <code>s2</code> are passed through
 * <code>RegexConverter.strToRegex()</code> before they are handed to
 * <code>String.replaceAll()</code>; the pattern is prefixed with the
 * case-insensitivity flag <code>(?i)</code>.
 *
 * @param s the string
 * @param s1 the substring to be replaced
 * @param s2 the replacement for the substring
 * @return modified string
 */
public static String replaceIgnoreCase(String s, String s1, String s2) {
    String pattern = "(?i)" + RegexConverter.strToRegex(s1);
    // NOTE(review): the replacement argument of replaceAll() only treats
    // '\' and '$' specially - verify that strToRegex() escapes both
    String replacement = RegexConverter.strToRegex(s2);
    return s.replaceAll(pattern, replacement);
}
/**
 * <p>Sorts an array of strings in place, shortest string first.</p>
 *
 * <p>This sort is guaranteed to be stable: strings of equal length keep
 * their relative order.</p>
 *
 * @param ss array of strings
 */
public static void sortByLength(String[] ss) {
    Arrays.sort(ss, new Comparator<String>() {
        public int compare(String left, String right) {
            int leftLen = left.length();
            int rightLen = right.length();
            if (leftLen < rightLen) return -1;
            if (leftLen > rightLen) return 1;
            return 0;
        }
    });
}
/**
 * <p>Sorts an array of strings in place, longest string first.</p>
 *
 * <p>This sort is guaranteed to be stable: strings of equal length keep
 * their relative order.</p>
 *
 * @param ss array of strings
 */
public static void sortByLengthDesc(String[] ss) {
    Arrays.sort(ss, new Comparator<String>() {
        public int compare(String left, String right) {
            int leftLen = left.length();
            int rightLen = right.length();
            if (leftLen > rightLen) return -1;
            if (leftLen < rightLen) return 1;
            return 0;
        }
    });
}
}