package com.limegroup.gnutella.util;
import java.text.Collator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Vector;
import com.limegroup.gnutella.Assert;
import com.limegroup.gnutella.Constants;
import com.limegroup.gnutella.FileManager;
import com.limegroup.gnutella.settings.ApplicationSettings;
import com.limegroup.gnutella.settings.SearchSettings;
/** Various static routines for manipulating strings.*/
public class StringUtils {
/**
 * Words that are too common to be treated as keywords.  Every entry must
 * be lower-case because tokens are lower-cased before being looked up here.
 */
private static final List TRIVIAL_WORDS;
/**
 * Collator used for internationalized comparisons; built from the
 * language, country and variant found in the application settings.
 */
private final static Collator COLLATOR;
static {
    TRIVIAL_WORDS = new ArrayList(
        Arrays.asList(new String[] { "the", "an", "a", "and" }));

    Locale configuredLocale =
        new Locale(ApplicationSettings.LANGUAGE.getValue(),
                   ApplicationSettings.COUNTRY.getValue(),
                   ApplicationSettings.LOCALE_VARIANT.getValue());
    Collator collator = Collator.getInstance(configuredLocale);
    //Decompose fully and compare at primary strength only.
    collator.setDecomposition(Collator.FULL_DECOMPOSITION);
    collator.setStrength(Collator.PRIMARY);
    COLLATOR = collator;
}
/** Returns true if input contains the given pattern, which may contain the
* wildcard character '*'. The comparison is case-sensitive; this simply
* delegates to contains(input, pattern, false).
*
* The pattern is read as a sequence of tokens separated by ' ', '+' or '*'.
* Every token must occur in input. A token that follows ' ' or '+' may
* occur anywhere in input; a token that follows '*' must occur after the
* end of the match of the token before the '*'. Empty tokens always match.
* Examples:
*
* <pre>
* StringUtils.contains("", "") ==> true
* StringUtils.contains("abc", "") ==> true
* StringUtils.contains("abc", "b") ==> true
* StringUtils.contains("abc", "d") ==> false
* StringUtils.contains("abcd", "a*d") ==> true
* StringUtils.contains("abcd", "*a**d*") ==> true
* StringUtils.contains("abcd", "d*a") ==> false
* </pre>
*
* @param input the string to search in
* @param pattern the pattern to look for
* @return true if every token of pattern is found in input as described
*/
public static final boolean contains(String input, String pattern) {
return contains(input, pattern, false);
}
/**
 * Exactly like contains(input, pattern), but case is ignored if
 * ignoreCase==true.  Case folding is done by subset(..) and therefore
 * only covers the ASCII letters.
 *
 * @param input the string to search in
 * @param pattern tokens separated by ' ', '+' or '*'
 * @param ignoreCase whether ASCII case differences are ignored
 * @return true if every token of pattern is found in input
 */
public static final boolean contains(String input, String pattern,
                                     boolean ignoreCase) {
    //A fancier string-search algorithm would not pay off on strings this
    //short; what matters here is that no temporary objects are allocated.
    final int patternLength = pattern.length();
    //Index in input at which the next token search begins.
    int searchFrom = 0;
    int tokenStart = 0;
    while (tokenStart < patternLength) {
        //1. Scan forward to the separator (space, + or *) ending this token.
        //   If the pattern ends first, behave as if a space was seen.
        int tokenEnd = tokenStart;
        char separator = ' ';
        while (tokenEnd < patternLength) {
            char candidate = pattern.charAt(tokenEnd);
            if (candidate == '*' || candidate == '+' || candidate == ' ') {
                separator = candidate;
                break;
            }
            tokenEnd++;
        }
        //2. Look for pattern[tokenStart..tokenEnd-1] in input[searchFrom...].
        int matchIndex = subset(pattern, tokenStart, tokenEnd,
                                input, searchFrom,
                                ignoreCase);
        if (matchIndex < 0) {
            return false;
        }
        //3. After '*' the next token must follow this match; after ' ' or
        //   '+' the next token may appear anywhere in input.
        if (separator == '*') {
            searchFrom = matchIndex + (tokenEnd - tokenStart);
        } else {
            searchFrom = 0;
        }
        tokenStart = tokenEnd + 1;
    }
    return true;
}
/**
 * Returns true if input contains at least one of the given characters.
 *
 * @param input the string to inspect
 * @param chars the characters to look for
 * @return true if any element of chars occurs in input, false otherwise
 *  (in particular when chars is empty)
 */
public static boolean containsCharacters(String input, char [] chars) {
    //Sort a private copy of the input so each lookup is a binary search.
    char [] sortedInput = input.toCharArray();
    Arrays.sort(sortedInput);
    int index = 0;
    while (index < chars.length) {
        int position = Arrays.binarySearch(sortedInput, chars[index]);
        if (position >= 0) {
            return true;
        }
        index++;
    }
    return false;
}
/**
 * Finds little[littleStart...littleStop-1] inside big without allocating
 * a substring.
 *
 * @requires 0 &lt;= littleStart &lt;= littleStop &lt;= little.length() and
 *  bigStart &gt;= 0; otherwise charAt may throw
 * @effects returns the smallest i&gt;=bigStart
 *  s.t. little[littleStart...littleStop-1] is a prefix of big[i...]
 *  or -1 if no such i exists.  If ignoreCase==true, case doesn't matter
 *  when comparing characters (ASCII letters only, see toOtherCase);
 *  if ignoreCase==false the comparison is exact.  An empty range of
 *  little matches at bigStart as long as bigStart&lt;=big.length().
 */
private static final int subset(String little, int littleStart, int littleStop,
                                String big, int bigStart,
                                boolean ignoreCase) {
    //For ignoreCase==false this is equivalent to
    //  return big.indexOf(little.substring(littleStart, littleStop), bigStart);
    //but without an allocation.
    final int littleLength = littleStop - littleStart;
    //Largest shift at which little still fits inside big.
    final int lastShift = big.length() - littleLength;
    outerLoop:
    for (int i = bigStart; i <= lastShift; i++) {
        //Check if little[littleStart...littleStop-1] matches with shift i
        for (int j = 0; j < littleLength; j++) {
            char c1 = big.charAt(i + j);
            char c2 = little.charAt(littleStart + j);
            if (c1 == c2)
                continue;
            if (ignoreCase && c1 == toOtherCase(c2))
                continue;
            continue outerLoop;      //mismatch; try the next shift
        }
        return i;
    }
    return -1;
}
/**
 * Flips the case of an ASCII letter: 'A'..'Z' map to 'a'..'z' and vice
 * versa.  Every other character, including non-ASCII letters, is returned
 * unchanged.  This is <b>not internationalized</b>; but it is fast.
 *
 * @param c the character to convert
 * @return c with its ASCII case flipped, or c itself if it is not an
 *  ASCII letter
 */
public static final char toOtherCase(char c) {
    //Distance between an upper-case ASCII letter and its lower-case twin.
    final int caseOffset = 'a' - 'A';
    if (c >= 'A' && c <= 'Z') {
        return (char) (c + caseOffset);
    }
    if (c >= 'a' && c <= 'z') {
        return (char) (c - caseOffset);
    }
    return c;
}
/**
 * Single-delimiter convenience form of split(String, String): the one
 * character is turned into a one-character delimiter string.
 *
 * @param s the string to tokenize
 * @param delimiter the only delimiter character
 * @return the tokens of s, as returned by split(String, String)
 */
public static String[] split(String s, char delimiter) {
    return split(s, String.valueOf(delimiter));
}
/**
 * Returns the tokens of s delimited by the given delimiter, without
 * returning the delimiter.  Repeated sequences of delimiters are treated
 * as one.  Examples:
 * <pre>
 *    split("a//b/ c /","/")=={"a","b"," c "}
 *    split("a b", "/")=={"a b"}.
 *    split("///", "/")=={}.
 * </pre>
 *
 * <b>Note that whitespace is preserved if it is not part of the delimiter.</b>
 * An older version of this trim()'ed each token of whitespace.
 *
 * @param s the string to tokenize
 * @param delimiters every character of this string acts as a delimiter
 * @return the non-empty tokens of s, in order; never null
 */
public static String[] split(String s, String delimiters) {
    StringTokenizer tokenizer = new StringTokenizer(s, delimiters);
    //countTokens() tells exactly how many nextToken() calls will succeed,
    //so the result array can be filled directly without a scratch list.
    String[] ret = new String[tokenizer.countTokens()];
    for (int i = 0; i < ret.length; i++)
        ret[i] = tokenizer.nextToken();
    return ret;
}
/**
 * Single-delimiter convenience form of splitNoCoalesce(String, String):
 * the one character is turned into a one-character delimiter string.
 *
 * @param s the string to tokenize
 * @param delimiter the only delimiter character
 * @return the tokens of s, as returned by splitNoCoalesce(String, String)
 */
public static String[] splitNoCoalesce(String s, char delimiter) {
    return splitNoCoalesce(s, String.valueOf(delimiter));
}
/**
 * Similar to split(s, delimiters) except that subsequent delimiters are not
 * coalesced, so the returned array may contain empty strings.  If s starts
 * (ends) with a delimiter, the returned array starts (ends) with an empty
 * string.  If a non-empty s contains N delimiters, N+1 strings are
 * returned; the empty string yields an empty array.
 * Examples:
 *
 * <pre>
 *    splitNoCoalesce("a//b/ c /","/")=={"a","","b"," c ", ""}
 *    splitNoCoalesce("a b", "/")=={"a b"}.
 *    splitNoCoalesce("///", "/")=={"","","",""}.
 *    splitNoCoalesce("", "/")=={}.
 * </pre>
 *
 * @param s the string to tokenize
 * @param delimiters every character of this string acts as a delimiter
 * @return for non-empty s, an array A s.t.
 *  s.equals(A[0]+d0+A[1]+d1+...+A[N]), where for all dI,
 *  dI.size()==1 && delimiters.indexOf(dI)>=0; and for
 *  all c in A[i], delimiters.indexOf(c)<0
 */
public static String[] splitNoCoalesce(String s, String delimiters) {
    //Tokenize s, asking the tokenizer to hand back the delimiters too.
    StringTokenizer tokenizer = new StringTokenizer(s, delimiters, true);
    List buf = new ArrayList();
    //True if last token was a delimiter.  Initialized to true to force
    //an empty string if s starts with a delimiter.
    boolean gotDelimiter = true;
    while (tokenizer.hasMoreTokens()) {
        String token = tokenizer.nextToken();
        //Is token a delimiter?
        if (token.length() == 1 && delimiters.indexOf(token) >= 0) {
            //If so, add blank only if last token was a delimiter.
            if (gotDelimiter)
                buf.add("");
            gotDelimiter = true;
        } else {
            //If not, add "real" token.
            buf.add(token);
            gotDelimiter = false;
        }
    }
    //Add trailing empty string UNLESS s is the empty string.
    if (gotDelimiter && !buf.isEmpty())
        buf.add("");
    return (String[]) buf.toArray(new String[buf.size()]);
}
/**
 * Compares two strings lexicographically while ignoring case, in the
 * manner of s1.compareToIgnoreCase(s2).
 *
 * @param s1 the first string
 * @param s2 the second string
 * @return a negative number, zero, or a positive number as s1 is less
 *  than, equal to, or greater than s2 when case is ignored
 */
public static int compareIgnoreCase(String s1, String s2) {
    //Characters are folded to upper case first and to lower case second:
    //some characters have several upper or lower case forms (or a title
    //case, such as "Dz"), so a single conversion is not enough.  Upper
    //case goes first because clashes are rarer there.
    final int len1 = s1.length();
    final int len2 = s2.length();
    final int shared = Math.min(len1, len2);
    for (int pos = 0; pos < shared; pos++) {
        char a = s1.charAt(pos);
        char b = s2.charAt(pos);
        if (a == b) {
            continue;               // identical, no conversion needed
        }
        a = Character.toUpperCase(a);
        b = Character.toUpperCase(b);
        if (a == b) {
            continue;               // equal once upper-cased
        }
        a = Character.toLowerCase(a);
        b = Character.toLowerCase(b);
        if (a != b) {
            return a - b;
        }
    }
    //One string is a case-insensitive prefix of the other.
    return len1 - len2;
}
/**
* Compares the two strings with the class-wide COLLATOR, which is
* configured for full decomposition and primary strength, so only
* primary differences are taken into account.
* The comparison will ignore case as well as
* differences like FULLWIDTH vs HALFWIDTH.
*
* @param s1 the first string
* @param s2 the second string
* @return the result of COLLATOR.compare(s1, s2): negative, zero or
*  positive as s1 sorts before, equal to, or after s2
*/
public static int compareFullPrimary(String s1, String s2) {
return COLLATOR.compare(s1, s2);
}
/**
 * Returns true iff s starts with prefix, ignoring case.  Characters are
 * compared one by one: as-is, then upper-cased, then lower-cased.
 *
 * @param s the string to test
 * @param prefix the prefix to look for
 * @return true iff the first prefix.length() characters of s equal
 *  prefix when case is ignored
 */
public static boolean startsWithIgnoreCase(String s, String prefix) {
    final int prefixLength = prefix.length();
    if (prefixLength > s.length()) {
        return false;
    }
    for (int pos = 0; pos < prefixLength; pos++) {
        char fromString = s.charAt(pos);
        char fromPrefix = prefix.charAt(pos);
        if (fromString == fromPrefix) {
            continue;
        }
        fromString = Character.toUpperCase(fromString);
        fromPrefix = Character.toUpperCase(fromPrefix);
        if (fromString == fromPrefix) {
            continue;
        }
        fromString = Character.toLowerCase(fromString);
        fromPrefix = Character.toLowerCase(fromPrefix);
        if (fromString != fromPrefix) {
            return false;
        }
    }
    return true;
}
/**
 * Joins the string forms of the entries of a collection, separated by
 * Constants.ENTRY_SEPARATOR, so that the result can be used in HTTP
 * headers (among other purposes).
 *
 * @param collection the collection whose entries are to be converted to
 *  string form; entries are rendered with toString()
 * @return the entries in iteration order, joined by the separator; the
 *  empty string if the collection is empty.
 *  e.g. For a collection with entries ("a", "b"), the string returned will
 *  be "a,b"
 */
public static String getEntriesAsString(Collection collection){
    Iterator entries = collection.iterator();
    if (!entries.hasNext()) {
        return "";
    }
    //The first entry is written bare; each later one follows a separator.
    StringBuffer joined = new StringBuffer();
    joined.append(entries.next().toString());
    while (entries.hasNext()) {
        Object entry = entries.next();
        joined.append(Constants.ENTRY_SEPARATOR);
        joined.append(entry.toString());
    }
    return joined.toString();
}
/**
 * Splits a string of entries into a Set of strings.
 *
 * @param values The string representation of entries to be split.
 *  The entries in the string are separated by Constants.ENTRY_SEPARATOR
 * @return the distinct entries as a set.
 *  e.g. For string "a,b", the Set returned will have 2 entries:
 *  "a" & "b"
 */
public static Set getSetofValues(String values){
    Set result = new HashSet();
    StringTokenizer tokens =
        new StringTokenizer(values, Constants.ENTRY_SEPARATOR);
    for ( ; tokens.hasMoreTokens(); ) {
        result.add(tokens.nextToken());
    }
    return result;
}
/**
 * Replaces all non-overlapping occurrences of old_str in str with new_str,
 * scanning from left to right.  After a match the search resumes just
 * past that match, so replace("aaa", "aa", "b") yields "ba".
 *
 * @param str the String to modify
 * @param old_str the String to be replaced; if it is empty there is
 *  nothing to search for and str is returned unchanged
 * @param new_str the String to replace old_str with
 *
 * @return the modified str, or str itself if old_str does not occur.
 */
public static String replace(String str, String old_str, String new_str) {
    final int oldLength = old_str.length();
    //An empty search string matches at every index, which would never
    //terminate; treat it as "nothing to replace".
    if (oldLength == 0)
        return str;
    int match = str.indexOf(old_str);
    if (match < 0)
        return str;
    StringBuffer buf = new StringBuffer(str.length());
    //Start of the text not yet copied into buf.
    int copiedUpTo = 0;
    while (match > -1) {
        buf.append(str.substring(copiedUpTo, match));
        buf.append(new_str);
        copiedUpTo = match + oldLength;
        //Resume after the match so that replaced text is never rescanned.
        match = str.indexOf(old_str, copiedUpTo);
    }
    buf.append(str.substring(copiedUpTo));
    return buf.toString();
}
/**
 * Cuts a string down to at most maxLen characters.
 *
 * @param string the string to shorten
 * @param maxLen the maximum number of characters to keep
 * @return string itself if it is no longer than maxLen, otherwise its
 *  first maxLen characters
 */
public static String truncate(final String string, final int maxLen) {
    return string.length() > maxLen
        ? string.substring(0, maxLen)
        : string;
}
/**
 * Finds the starting index of a substring within another string,
 * ignoring case.  This is expensive: lower-case copies of both strings
 * are created before the search takes place.
 *
 * @param str the string in which to search for the <tt>substring</tt>
 *  argument
 * @param substring the substring to search for in <tt>str</tt>
 * @return the index, within the lower-cased <tt>str</tt>, of the first
 *  occurrence of the lower-cased <tt>substring</tt>; -1 if there is no
 *  such occurrence
 */
public static int indexOfIgnoreCase(String str, String substring) {
    final String haystack = str.toLowerCase();
    final String needle = substring.toLowerCase();
    return haystack.indexOf(needle);
}
/**
* Convenience wrapper for
* {@link #createQueryString(String, boolean) createQueryString(String, false)}.
* Number keywords are therefore not kept in the result.
*
* @param name the name to build a query string from
* @return the query string produced by createQueryString(name, false)
*/
public static String createQueryString(String name) {
return createQueryString(name, false);
}
/**
 * Returns a string to be used for querying from the given name.  The
 * name is normalized, its keywords are extracted, and as many keywords
 * as fit within SearchSettings.MAX_QUERY_LENGTH are joined with single
 * spaces.
 *
 * @param name the name to build a query from; must not be null
 * @param allowNumbers whether numbers in the argument should be kept in
 *  the result
 * @return the query string, at most MAX_QUERY_LENGTH characters long
 * @throws NullPointerException if name is null
 */
public static String createQueryString(String name, boolean allowNumbers) {
    if (name == null) {
        throw new NullPointerException("null name");
    }

    // Normalize before doing anything else with the name.
    name = I18NConvert.instance().getNorm(name);

    final int maxLength = SearchSettings.MAX_QUERY_LENGTH.getValue();
    final Set keywordSet = keywords(name, allowNumbers);

    String query = null;
    if (keywordSet.size() >= 1) {
        StringBuffer joined = new StringBuffer();
        int written = 0;
        Iterator it = keywordSet.iterator();
        while (it.hasNext() && written < maxLength) {
            String keyword = (String) it.next();
            // Keywords that would not fit are skipped, not truncated.
            if (written + keyword.length() >= maxLength) {
                continue;
            }
            if (written > 0) {
                // Separate from the keyword written before.
                joined.append(" ");
                written++;
            }
            joined.append(keyword);
            written += keyword.length();
        }
        query = joined.toString();
        // If no keyword fit, the buffer is empty; fall back to the
        // truncated (normalized) name.
        if (query.equals("")) {
            query = StringUtils.truncate(name, maxLength);
        }
    } else {
        // No keywords at all: clean up the name and cut it to size.
        query = StringUtils.removeIllegalChars(name);
        query = StringUtils.truncate(query, maxLength);
    }

    // Sanity checks meant to catch inputs the logic above does not
    // handle.  Note that the null check runs after query has already
    // been dereferenced by the two checks before it.
    Assert.that(query.length() <= maxLength,
                "Original filename: " + name +
                ", converted: " + query);
    Assert.that(!query.equals(""),
                "Original filename: " + name);
    Assert.that(query != null,
                "Original filename: " + name);
    return query;
}
/**
 * Removes illegal characters from the name, inserting spaces instead.
 * The name is split on every character of SearchSettings.ILLEGAL_CHARS
 * and FileManager.DELIMITERS; each piece is trimmed and the pieces are
 * joined with single spaces.
 *
 * @param name the name to clean up
 * @return the trimmed pieces of name joined by spaces, with leading and
 *  trailing whitespace removed; the empty string if nothing is left
 */
public static final String removeIllegalChars(String name) {
    String delim = FileManager.DELIMITERS;
    char[] illegal = SearchSettings.ILLEGAL_CHARS.getValue();
    //All characters that split the name: illegal chars, then delimiters.
    StringBuffer separators = new StringBuffer(delim.length() + illegal.length);
    separators.append(illegal).append(delim);
    StringTokenizer st = new StringTokenizer(name, separators.toString());
    //Accumulate in a buffer rather than with String +=, which would copy
    //the whole partial result once per token.
    StringBuffer cleaned = new StringBuffer(name.length());
    while (st.hasMoreTokens())
        cleaned.append(st.nextToken().trim()).append(' ');
    //Drops the space appended after the last token.
    return cleaned.toString().trim();
}
/**
* Convenience wrapper for
* {@link #keywords(String, boolean) keywords(String, false)}.
* Number keywords are therefore not returned.
*
* @param fileName the file name to extract keywords from
* @return the set of keywords produced by keywords(fileName, false)
*/
public static final Set keywords(String fileName) {
return keywords(fileName, false);
}
/**
 * Gets the keywords in this filename, separated by delimiters & illegal
 * characters.  The extension is stripped first; each remaining token is
 * lower-cased, and trivial words are left out.
 *
 * @param fileName the file name to extract keywords from
 * @param allowNumbers whether number keywords are retained and returned
 *  in the result set
 * @return the distinct keywords, in the order they first occur
 */
public static final Set keywords(String fileName, boolean allowNumbers) {
    //The extension is not part of the keywords.
    final String baseName = ripExtension(fileName);

    //Tokens are split on the illegal characters followed by the delimiters.
    String delim = FileManager.DELIMITERS;
    char[] illegal = SearchSettings.ILLEGAL_CHARS.getValue();
    StringBuffer separators = new StringBuffer(delim.length() + illegal.length);
    separators.append(illegal).append(FileManager.DELIMITERS);

    Set found = new LinkedHashSet();
    StringTokenizer tokens = new StringTokenizer(baseName, separators.toString());
    while (tokens.hasMoreTokens()) {
        final String word = tokens.nextToken().toLowerCase();

        //A token counts as a number if Double.valueOf accepts it.
        boolean isNumber;
        try {
            Double.valueOf(word);
            isNumber = true;
        } catch (NumberFormatException notANumber) {
            isNumber = false;
        }
        if (isNumber && !allowNumbers) {
            continue;
        }
        if (TRIVIAL_WORDS.contains(word)) {
            continue;
        }
        found.add(word);
    }
    return found;
}
/**
 * Strips an extension off of a file's filename.
 *
 * @param fileName the file name to shorten
 * @return everything before the last '.' in fileName, or fileName itself
 *  if it contains no '.'
 */
public static String ripExtension(String fileName) {
    final int lastDot = fileName.lastIndexOf('.');
    if (lastDot != -1) {
        return fileName.substring(0, lastDot);
    }
    return fileName;
}
//Unit tests: tests/com/limegroup/gnutella/util/StringUtils
}