package com.limegroup.gnutella.util; import java.text.Collator; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Locale; import java.util.Set; import java.util.StringTokenizer; import java.util.Vector; import com.limegroup.gnutella.Assert; import com.limegroup.gnutella.Constants; import com.limegroup.gnutella.FileManager; import com.limegroup.gnutella.settings.ApplicationSettings; import com.limegroup.gnutella.settings.SearchSettings; /** Various static routines for manipulating strings.*/ public class StringUtils { /** * Trivial words that are not considered keywords. */ private static final List TRIVIAL_WORDS; /** * Collator used for internationalization. */ private final static Collator COLLATOR; static { TRIVIAL_WORDS = new ArrayList(3); TRIVIAL_WORDS.add("the"); //must be lower-case TRIVIAL_WORDS.add("an"); TRIVIAL_WORDS.add("a"); TRIVIAL_WORDS.add("and"); COLLATOR = Collator.getInstance (new Locale(ApplicationSettings.LANGUAGE.getValue(), ApplicationSettings.COUNTRY.getValue(), ApplicationSettings.LOCALE_VARIANT.getValue())); COLLATOR.setDecomposition(Collator.FULL_DECOMPOSITION); COLLATOR.setStrength(Collator.PRIMARY); } /** Returns true if input contains the given pattern, which may contain the * wildcard character '*'. TODO: need more formal definition. Examples: * * <pre> * StringUtils.contains("", "") ==> true * StringUtils.contains("abc", "") ==> true * StringUtils.contains("abc", "b") ==> true * StringUtils.contains("abc", "d") ==> false * StringUtils.contains("abcd", "a*d") ==> true * StringUtils.contains("abcd", "*a**d*") ==> true * StringUtils.contains("abcd", "d*a") ==> false * </pre> */ public static final boolean contains(String input, String pattern) { return contains(input, pattern, false); } /** Exactly like contains(input, pattern), but case is ignored if * ignoreCase==true. */ public static final boolean contains(String input, String pattern, boolean ignoreCase) { //More efficient algorithms are possible, e.g. a modified version of the //Rabin-Karp algorithm, but they are unlikely to be faster with such //short strings. Also, some contant time factors could be shaved by //combining the second FOR loop below with the subset(..) call, but that //just isn't important. The important thing is to avoid needless //allocations. final int n=pattern.length(); //Where to resume searching after last wildcard, e.g., just past //the last match in input. int last=0; //For each token in pattern starting at i... for (int i=0; i<n; ) { //1. Find the smallest j>i s.t. pattern[j] is space, *, or +. char c=' '; int j=i; for ( ; j<n; j++) { char c2=pattern.charAt(j); if (c2==' ' || c2=='+' || c2=='*') { c=c2; break; } } //2. Match pattern[i..j-1] against input[last...]. int k=subset(pattern, i, j, input, last, ignoreCase); if (k<0) return false; //3. Reset the starting search index if got ' ' or '+'. //Otherwise increment past the match in input. if (c==' ' || c=='+') last=0; else if (c=='*') last=k+j-i; i=j+1; } return true; } public static boolean containsCharacters(String input, char [] chars) { char [] inputChars = input.toCharArray(); Arrays.sort(inputChars); for(int i=0; i<chars.length; i++) { if(Arrays.binarySearch(inputChars, chars[i]) >= 0) return true; } return false; } /** * @requires TODO3: fill this in * @effects returns the the smallest i>=bigStart * s.t. little[littleStart...littleStop-1] is a prefix of big[i...] * or -1 if no such i exists. If ignoreCase==false, case doesn't matter * when comparing characters. */ private static final int subset(String little, int littleStart, int littleStop, String big, int bigStart, boolean ignoreCase) { //Equivalent to // return big.indexOf(little.substring(littleStart, littleStop), bigStart); //but without an allocation. //Note special case for ignoreCase below. if (ignoreCase) { final int n=big.length()-(littleStop-littleStart)+1; outerLoop: for (int i=bigStart; i<n; i++) { //Check if little[littleStart...littleStop-1] matches with shift i final int n2=littleStop-littleStart; for (int j=0 ; j<n2 ; j++) { char c1=big.charAt(i+j); char c2=little.charAt(littleStart+j); if (c1!=c2 && c1!=toOtherCase(c2)) //Ignore case. See below. continue outerLoop; } return i; } return -1; } else { final int n=big.length()-(littleStop-littleStart)+1; outerLoop: for (int i=bigStart; i<n; i++) { final int n2=littleStop-littleStart; for (int j=0 ; j<n2 ; j++) { char c1=big.charAt(i+j); char c2=little.charAt(littleStart+j); if (c1!=c2) //Consider case. See above. continue outerLoop; } return i; } return -1; } } /** If c is a lower case ASCII character, returns Character.toUpperCase(c). * Else if c is an upper case ASCII character, returns Character.toLowerCase(c), * Else returns c. * Note that this is <b>not internationalized</b>; but it is fast. */ public static final char toOtherCase(char c) { int i=(int)c; final int A=(int)'A'; //65 final int Z=(int)'Z'; //90 final int a=(int)'a'; //97 final int z=(int)'z'; //122 final int SHIFT=a-A; if (i<A) //non alphabetic return c; else if (i<=Z) //upper-case return (char)(i+SHIFT); else if (i<a) //non alphabetic return c; else if (i<=z) //lower-case return (char)(i-SHIFT); else //non alphabetic return c; } /** * Exactly like split(s, Character.toString(delimiter)) */ public static String[] split(String s, char delimiter) { //Character.toString only available in Java 1.4+ return split(s, delimiter+""); } /** * Returns the tokens of s delimited by the given delimiter, without * returning the delimiter. Repeated sequences of delimiters are treated * as one. Examples: * <pre> * split("a//b/ c /","/")=={"a","b"," c "} * split("a b", "/")=={"a b"}. * split("///", "/")=={}. * </pre> * * <b>Note that whitespace is preserved if it is not part of the delimiter.</b> * An older version of this trim()'ed each token of whitespace. */ public static String[] split(String s, String delimiters) { //Tokenize s based on delimiters, adding to buffer. StringTokenizer tokenizer = new StringTokenizer(s, delimiters); Vector buf = new Vector(); while (tokenizer.hasMoreTokens()) buf.add(tokenizer.nextToken()); //Copy from buffer to array. String[] ret = new String[buf.size()]; for(int i=0; i<buf.size(); i++) ret[i] = (String)buf.get(i); return ret; } /** * Exactly like splitNoCoalesce(s, Character.toString(delimiter)) */ public static String[] splitNoCoalesce(String s, char delimiter) { //Character.toString only available in Java 1.4+ return splitNoCoalesce(s, delimiter+""); } /** * Similar to split(s, delimiters) except that subsequent delimiters are not * coalesced, so the returned array may contain empty strings. If s starts * (ends) with a delimiter, the returned array starts (ends) with an empty * strings. If s contains N delimiters, N+1 strings are always returned. * Examples: * * <pre> * split("a//b/ c /","/")=={"a","","b"," c ", ""} * split("a b", "/")=={"a b"}. * split("///", "/")=={"","","",""}. * </pre> * * @return an array A s.t. s.equals(A[0]+d0+A[1]+d1+...+A[N]), where * for all dI, dI.size()==1 && delimiters.indexOf(dI)>=0; and for * all c in A[i], delimiters.indexOf(c)<0 */ public static String[] splitNoCoalesce(String s, String delimiters) { //Tokenize s based on delimiters, adding to buffer. StringTokenizer tokenizer = new StringTokenizer(s, delimiters, true); Vector buf = new Vector(); //True if last token was a delimiter. Initialized to true to force //an empty string if s starts with a delimiter. boolean gotDelimiter=true; while (tokenizer.hasMoreTokens()) { String token=tokenizer.nextToken(); //Is token a delimiter? if (token.length()==1 && delimiters.indexOf(token)>=0) { //If so, add blank only if last token was a delimiter. if (gotDelimiter) buf.add(""); gotDelimiter=true; } else { //If not, add "real" token. buf.add(token); gotDelimiter=false; } } //Add trailing empty string UNLESS s is the empty string. if (gotDelimiter && !buf.isEmpty()) buf.add(""); //Copy from buffer to array. String[] ret = new String[buf.size()]; for(int i=0; i<buf.size(); i++) ret[i] = (String)buf.get(i); return ret; } /** Exactly the same as s1.compareToIgnoreCase(s2), which unfortunately * doesn't exist in Java 1.1.8. */ public static int compareIgnoreCase(String s1, String s2) { //Check out String.compareTo(String) for a description of the basic //algorithm. The ignore case extension is trivial. //We need to compare both uppercase and lowercase characters because //some characters have two distinct associated upper or lower cases //or exist in title case (such as "Dz"). We start by comparing the //upper case conversion because duplicate uppercases occur less often. final int n1 = s1.length(), n2 = s2.length(); final int lim = Math.min(n1, n2); for (int k = 0; k < lim; k++) { char c1 = s1.charAt(k); char c2 = s2.charAt(k); if (c1 != c2) { // avoid conversion if characters are equal c1 = Character.toUpperCase(c1); c2 = Character.toUpperCase(c2); if (c1 != c2) { // avoid conversion if uppercases are equal c1 = Character.toLowerCase(c1); c2 = Character.toLowerCase(c2); if (c1 != c2) { return c1 - c2; } } } } return n1 - n2; } /** * This method will compare the two strings using * full decomposition and only look at primary differences * The comparision will ignore case as well as * differences like FULLWIDTH vs HALFWIDTH */ public static int compareFullPrimary(String s1, String s2) { return COLLATOR.compare(s1, s2); } /** * Returns true iff s starts with prefix, ignoring case. * @return true iff s.toUpperCase().startsWith(prefix.toUpperCase()) */ public static boolean startsWithIgnoreCase(String s, String prefix) { final int pl = prefix.length(); if (s.length() < pl) return false; for (int i = 0; i < pl; i++) { char sc = s.charAt(i); char pc = prefix.charAt(i); if (sc != pc) { sc = Character.toUpperCase(sc); pc = Character.toUpperCase(pc); if (sc != pc) { sc = Character.toLowerCase(sc); pc = Character.toLowerCase(pc); if (sc!=pc) return false; } } } return true; } /** * Returns the entries in the set in a string form, that can be used * in HTTP headers (among other purposes) * @param set The set whose entries are to be convereted to string form * @return the entries in the set in a string form. * e.g. For a collection with entries ("a", "b"), the string returned will * be "a,b" */ public static String getEntriesAsString(Collection collection){ StringBuffer buffer = new StringBuffer(); boolean isFirstEntry = true; //get the connected supernodes and pass them for(Iterator iter = collection.iterator();iter.hasNext();){ //get the next entry Object entry = iter.next(); //if the first entry that we are adding if(!isFirstEntry){ //append separator to separate the entries buffer.append(Constants.ENTRY_SEPARATOR); }else{ //unset the flag isFirstEntry = false; } //append the entry buffer.append(entry.toString()); } return buffer.toString(); } /** * Returns the entries passed in the string form as a Set fo strings * @param values The string representation of entries to be split. * The entries in the string are separated by Constants.ENTRY_SEPARATOR * @return the entries in the set form. * e.g. For string "a,b", the Set returned will have 2 entries: * "a" & "b" */ public static Set getSetofValues(String values){ Set valueSet = new HashSet(); //tokenize the values StringTokenizer st = new StringTokenizer(values, Constants.ENTRY_SEPARATOR); //add the values to the set while(st.hasMoreTokens()){ valueSet.add(st.nextToken()); } //return the set return valueSet; } /** * Replaces all occurrences of old_str in str with new_str * * @param str the String to modify * @param old_str the String to be replaced * @param new_str the String to replace old_str with * * @return the modified str. */ public static String replace(String str, String old_str, String new_str) { int o = 0; StringBuffer buf = new StringBuffer(); for (int i = str.indexOf(old_str) ; i > -1 ; i = str.indexOf(old_str, i+1)) { if (i > o ) { buf.append (str.substring(o, i)); } buf.append (new_str); o = i+old_str.length(); } buf.append (str.substring(o, str.length())); return buf.toString(); } /** * Returns a truncated string, up to the maximum number of characters */ public static String truncate(final String string, final int maxLen) { if(string.length() <= maxLen) return string; else return string.substring(0, maxLen); } /** * Helper method to obtain the starting index of a substring within another * string, ignoring their case. This method is expensive because it has * to set each character of each string to lower case before doing the * comparison. * * @param str the string in which to search for the <tt>substring</tt> * argument * @param substring the substring to search for in <tt>str</tt> * @return if the <tt>substring</tt> argument occurs as a substring within * <tt>str</tt>, then the index of the first character of the first such * substring is returned; if it does not occur as a substring, -1 is * returned */ public static int indexOfIgnoreCase(String str, String substring) { // Look for the index after the expensive conversion to lower case. return str.toLowerCase().indexOf(substring.toLowerCase()); } /** * Convenience wrapper for * {@link #createQueryString(String, boolean) createQueryString(String, false)}. * @param name * @return */ public static String createQueryString(String name) { return createQueryString(name, false); } /** * * Returns a string to be used for querying from the given name. * * @param name * @param allowNumbers whether numbers in the argument should be kept in * the result * @return */ public static String createQueryString(String name, boolean allowNumbers) { if(name == null) throw new NullPointerException("null name"); String retString = null; // normalize the name. name = I18NConvert.instance().getNorm(name); final int MAX_LEN = SearchSettings.MAX_QUERY_LENGTH.getValue(); //Get the set of keywords within the name. Set intersection = keywords(name, allowNumbers); if (intersection.size() < 1) { // nothing to extract! retString = StringUtils.removeIllegalChars(name); retString = StringUtils.truncate(retString, MAX_LEN); } else { StringBuffer sb = new StringBuffer(); int numWritten = 0; Iterator keys = intersection.iterator(); for (; keys.hasNext() && (numWritten < MAX_LEN); ) { String currKey = (String) keys.next(); // if we have space to add the keyword if ((numWritten + currKey.length()) < MAX_LEN) { if (numWritten > 0) { // add a space if we've written before sb.append(" "); numWritten++; } sb.append(currKey); // add the new keyword numWritten += currKey.length(); } } retString = sb.toString(); //one small problem - if every keyword in the filename is //greater than MAX_LEN, then the string returned will be empty. //if this happens just truncate the first word.... if (retString.equals("")) retString = StringUtils.truncate(name, MAX_LEN); } // Added a bunch of asserts to catch bugs. There is some form of // input we are not considering in our algorithms.... Assert.that(retString.length() <= MAX_LEN, "Original filename: " + name + ", converted: " + retString); Assert.that(!retString.equals(""), "Original filename: " + name); Assert.that(retString != null, "Original filename: " + name); return retString; } /** * Removes illegal characters from the name, inserting spaces instead. */ public static final String removeIllegalChars(String name) { String ret = ""; String delim = FileManager.DELIMITERS; char[] illegal = SearchSettings.ILLEGAL_CHARS.getValue(); StringBuffer sb = new StringBuffer(delim.length() + illegal.length); sb.append(illegal).append(FileManager.DELIMITERS); StringTokenizer st = new StringTokenizer(name, sb.toString()); while(st.hasMoreTokens()) ret += st.nextToken().trim() + " "; return ret.trim(); } /** * Convenience wrapper for * {@link #keywords(String, boolean) keywords(String, false)}. * @param fileName * @return */ public static final Set keywords(String fileName) { return keywords(fileName, false); } /** * Gets the keywords in this filename, seperated by delimiters & illegal * characters. * * @param fileName * @param allowNumbers whether number keywords are retained and returned * in the result set * @return */ public static final Set keywords(String fileName, boolean allowNumbers) { //Remove extension fileName = ripExtension(fileName); //Separate by whitespace and _, etc. Set ret=new LinkedHashSet(); String delim = FileManager.DELIMITERS; char[] illegal = SearchSettings.ILLEGAL_CHARS.getValue(); StringBuffer sb = new StringBuffer(delim.length() + illegal.length); sb.append(illegal).append(FileManager.DELIMITERS); StringTokenizer st = new StringTokenizer(fileName, sb.toString()); while (st.hasMoreTokens()) { final String currToken = st.nextToken().toLowerCase(); try { //Ignore if a number //(will trigger NumberFormatException if not) Double.valueOf(currToken); if (!allowNumbers) { continue; } } catch (NumberFormatException normalWord) { } if (!TRIVIAL_WORDS.contains(currToken)) ret.add(currToken); } return ret; } /** * Strips an extension off of a file's filename. */ public static String ripExtension(String fileName) { String retString = null; int extStart = fileName.lastIndexOf('.'); if (extStart == -1) retString = fileName; else retString = fileName.substring(0, extStart); return retString; } //Unit tests: tests/com/limegroup/gnutella/util/StringUtils }