package com.limegroup.gnutella.util;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import org.limewire.core.settings.SearchSettings;
import org.limewire.util.I18NConvert;
import org.limewire.util.StringUtils;
public class QueryUtils {
/**
* Trivial words that are not considered keywords.
*/
private static final List<String> TRIVIAL_WORDS;
/**
* Characters used to tokenize queries and file names.
*/
public static final String DELIMITERS = " -._+/*()\\,";
private static final char[] DELIMITERS_CHARACTERS;
/**
* default set of delimiter characters AND illegal characters
*/
private static final String DELIMITERS_AND_ILLEGAL;
static {
// must be lower-case
TRIVIAL_WORDS = Arrays.asList("the", "an", "a", "and");
char[] characters = DELIMITERS.toCharArray();
Arrays.sort(characters);
DELIMITERS_CHARACTERS = characters;
char[] illegal = SearchSettings.ILLEGAL_CHARS.get();
StringBuilder sb = new StringBuilder(DELIMITERS.length() + illegal.length);
DELIMITERS_AND_ILLEGAL = sb.append(illegal).append(DELIMITERS).toString();
}
/**
* Extracts keywords from the given string, separated by delimiters and
* illegal characters, and returns them in a new set.
*
* @param str the string to extract keywords from
* @param allowNumbers whether numbers are treated as keywords
* @return a new set containing the keywords
*/
public static final Set<String> extractKeywords(String str,
boolean allowNumbers) {
Set<String> set = new LinkedHashSet<String>();
extractKeywords(str, allowNumbers, set);
return set;
}
/**
* Extracts keywords from the given string, separated by delimiters and
* illegal characters, and adds them to the supplied set.
*
* @param str the string to extract keywords from
* @param allowNumbers whether numbers are treated as keywords
* @param set the set to which the keywords should be added
*/
public static final void extractKeywords(String str, boolean allowNumbers,
Set<String> set) {
StringTokenizer st = new StringTokenizer(str, DELIMITERS_AND_ILLEGAL);
while(st.hasMoreTokens()) {
String currToken = st.nextToken().toLowerCase();
if(!allowNumbers) {
try {
Double.parseDouble(currToken); // NFE if not a number
continue;
} catch(NumberFormatException normalWord) {}
}
if(!TRIVIAL_WORDS.contains(currToken))
set.add(currToken);
}
}
/**
* Convenience wrapper for
* {@link #extractKeywords(String, boolean) keywords(String, false)}.
* @param fileName
* @return
*/
static final Set<String> extractKeywordsFromFileName(String fileName) {
return extractKeywords(ripExtension(fileName), false);
}
/**
* Removes illegal characters and delimiters from a string, inserting
* spaces instead.
*/
public static final String removeIllegalChars(String str) {
StringBuilder sb = new StringBuilder(str.length());
StringTokenizer st = new StringTokenizer(str, DELIMITERS_AND_ILLEGAL);
while(st.hasMoreTokens()) {
if(sb.length() > 0)
sb.append(' ');
sb.append(st.nextToken());
}
return sb.toString();
}
/**
* Strips an extension off of a file's filename.
*/
public static String ripExtension(String fileName) {
int extStart = fileName.lastIndexOf('.');
if (extStart == -1)
return fileName;
else
return fileName.substring(0, extStart);
}
/**
*
* Returns a string to be used for querying from the given name.
*
* @param name
* @param allowNumbers whether numbers in the argument should be kept in
* the result
* @return
*/
public static String createQueryString(String name, boolean allowNumbers) {
if(name == null)
throw new NullPointerException("null name");
String retString = null;
name = I18NConvert.instance().getNorm(name);
int maxLen = SearchSettings.MAX_QUERY_LENGTH.getValue();
//Get the set of keywords within the name.
Set<String> keywords = extractKeywords(ripExtension(name), allowNumbers);
if (keywords.isEmpty()) { // no suitable non-number words
retString = removeIllegalChars(name);
retString = StringUtils.truncate(retString, maxLen);
} else {
retString = constructQueryStringFromKeywords(maxLen, keywords);
}
// Added a bunch of asserts to catch bugs. There is some form of
// input we are not considering in our algorithms....
assert retString.length() <= maxLen : "Original filename: " + name + ", converted: " + retString;
if(!keywords.isEmpty())
assert !retString.equals("") : "Original filename: " + name;
return retString;
}
/**
* Constructs a space(" ") delimited query string that
* must be <= maxLen from a set of keywords.
*
* @param maxLen
* @param keywords set of keywords from which to generate the query string
* @return
*/
public static String constructQueryStringFromKeywords(int maxLen, Set<String> keywords) {
// adding keywords that fit when appended to query string field, skipping keywords that do not fit.
StringBuilder queryFieldValue = new StringBuilder();
for (String keyword : keywords) {
String delimIncl = (queryFieldValue.length() == 0) ? "" : " ";
if ((queryFieldValue.length() + keyword.length() + delimIncl.length())
<= maxLen) {
queryFieldValue.append(delimIncl);
queryFieldValue.append(keyword);
}
}
// in case the query string field is blank
// All keywords are longer than queryField_LIMIT,
// query string field would use maxLen chars of 1st keyword
if (queryFieldValue.length() == 0) {
queryFieldValue.append(StringUtils.truncate(keywords.iterator().next(), maxLen));
}
return queryFieldValue.toString();
}
/**
* Convenience wrapper for
* {@link #createQueryString(String, boolean) createQueryString(String, false)}.
* @param name
* @return
*/
public static String createQueryString(String name) {
return createQueryString(name, false);
}
public static final boolean isDelimiter(char c) {
return Arrays.binarySearch(DELIMITERS_CHARACTERS, c) >= 0;
}
/**
* Returns true if the filename contains all the words in the query. The
* system locale is used for converting case, so it's possible for this
* method to return false for filenames that would match under other
* locales.
*/
public static boolean filenameMatchesQuery(String filename, String query) {
if(query.isEmpty())
return true;
filename = filename.toLowerCase();
query = query.toLowerCase();
for(String queryWord : extractKeywords(query, false)) {
if(!filename.contains(queryWord))
return false;
}
return true;
}
/**
* Mutates a query string by shuffling the words and removing trivial words.
* The returned string may or may not differ from the argument.
*/
public static String mutateQuery(String query) {
ArrayList<String> words =
new ArrayList<String>(extractKeywords(query, true));
if(words.size() <= 1)
return query;
Collections.shuffle(words);
return StringUtils.explode(words, " ");
}
}