package com.limegroup.gnutella.util;

import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;

import org.limewire.core.settings.SearchSettings;
import org.limewire.util.I18NConvert;
import org.limewire.util.StringUtils;

/**
 * Static helpers for turning file names and free text into keyword sets and
 * length-limited query strings.
 */
public class QueryUtils {

    /**
     * Trivial words that are not considered keywords. Entries must be
     * lower-case because tokens are lower-cased before being compared.
     */
    private static final List<String> TRIVIAL_WORDS;

    /**
     * Characters used to tokenize queries and file names.
     */
    public static final String DELIMITERS = " -._+/*()\\,";

    /**
     * Sorted copy of {@link #DELIMITERS}; sorted so that
     * {@link #isDelimiter(char)} can binary-search it.
     */
    private static final char[] DELIMITERS_CHARACTERS;

    /**
     * Default set of delimiter characters AND illegal characters. The illegal
     * characters are read from {@code SearchSettings.ILLEGAL_CHARS} once,
     * when this class is initialized.
     */
    private static final String DELIMITERS_AND_ILLEGAL;

    static {
        // must be lower-case
        TRIVIAL_WORDS = Arrays.asList("the", "an", "a", "and");

        char[] sortedDelimiters = DELIMITERS.toCharArray();
        Arrays.sort(sortedDelimiters);
        DELIMITERS_CHARACTERS = sortedDelimiters;

        char[] illegal = SearchSettings.ILLEGAL_CHARS.get();
        StringBuilder sb = new StringBuilder(DELIMITERS.length() + illegal.length);
        DELIMITERS_AND_ILLEGAL = sb.append(illegal).append(DELIMITERS).toString();
    }

    /**
     * Gets the keywords in the given string, separated by delimiters and
     * illegal characters. Each token is lower-cased; trivial words
     * ("the", "an", "a", "and") are dropped.
     *
     * @param str String to extract keywords from
     * @param allowNumbers whether number keywords are retained and returned
     *        in the result set
     * @return the distinct keywords, in order of first appearance
     */
    public static final Set<String> extractKeywords(String str, boolean allowNumbers) {
        Set<String> keywords = new LinkedHashSet<String>();

        // Separate by whitespace and _, etc.
        StringTokenizer st = new StringTokenizer(str, DELIMITERS_AND_ILLEGAL);
        while (st.hasMoreTokens()) {
            // Use a fixed locale: the default-locale overload gives different
            // keywords on different machines (e.g. 'I' -> dotless 'i' under a
            // Turkish locale).
            String token = st.nextToken().toLowerCase(Locale.US);

            if (!allowNumbers && isNumber(token)) {
                continue;
            }
            if (!TRIVIAL_WORDS.contains(token)) {
                keywords.add(token);
            }
        }
        return keywords;
    }

    /**
     * Tells whether {@link Double#parseDouble(String)} accepts the token.
     *
     * NOTE(review): parseDouble also accepts a trailing type suffix and
     * exponent forms, so tokens such as "3d", "5f" or "1e5" count as numbers
     * here and are dropped when numbers are not allowed. Kept as-is to
     * preserve existing keyword behavior -- confirm whether this is intended.
     */
    private static boolean isNumber(String token) {
        try {
            Double.parseDouble(token);
            return true;
        } catch (NumberFormatException notANumber) {
            // expected for every ordinary word
            return false;
        }
    }

    /**
     * Convenience wrapper that strips the extension from the file name via
     * {@link #ripExtension(String)} and then calls
     * {@link #extractKeywords(String, boolean) extractKeywords(String, false)}.
     *
     * @param fileName the file name to extract keywords from
     * @return the non-number keywords of the file name without its extension
     */
    static final Set<String> extractKeywordsFromFileName(String fileName) {
        return extractKeywords(ripExtension(fileName), false);
    }

    /**
     * Removes delimiter and illegal characters from the name, inserting
     * single spaces between the remaining tokens instead.
     *
     * The illegal characters are read from
     * {@code SearchSettings.ILLEGAL_CHARS} on every call rather than taken
     * from the value captured at class initialization.
     *
     * @param name the name to clean up
     * @return the trimmed tokens joined by spaces, with no leading or
     *         trailing whitespace
     */
    public static final String removeIllegalChars(String name) {
        char[] illegal = SearchSettings.ILLEGAL_CHARS.get();
        String separators = new StringBuilder(DELIMITERS.length() + illegal.length)
                .append(illegal)
                .append(DELIMITERS)
                .toString();

        // StringBuilder instead of repeated String concatenation in the loop.
        StringBuilder cleaned = new StringBuilder(name.length());
        StringTokenizer st = new StringTokenizer(name, separators);
        while (st.hasMoreTokens()) {
            cleaned.append(st.nextToken().trim()).append(' ');
        }
        return cleaned.toString().trim();
    }

    /**
     * Strips an extension off of a file's filename: everything from the last
     * '.' onwards is removed. A name without any '.' is returned unchanged.
     *
     * @param fileName the file name to strip
     * @return the file name up to, but excluding, its last '.'
     */
    public static String ripExtension(String fileName) {
        int extStart = fileName.lastIndexOf('.');
        return (extStart == -1) ? fileName : fileName.substring(0, extStart);
    }

    /**
     * Returns a string to be used for querying from the given name.
     *
     * The name is first passed through {@code I18NConvert.getNorm}. If the
     * name (minus its extension) yields keywords, the query is built from
     * them; otherwise the name with illegal characters removed is truncated
     * to the configured maximum query length.
     *
     * @param name the name to build a query from
     * @param allowNumbers whether numbers in the argument should be kept in
     *        the result
     * @return the query string, at most
     *         {@code SearchSettings.MAX_QUERY_LENGTH} characters long
     * @throws NullPointerException if {@code name} is null
     */
    public static String createQueryString(String name, boolean allowNumbers) {
        if (name == null) {
            throw new NullPointerException("null name");
        }

        name = I18NConvert.instance().getNorm(name);
        int maxLen = SearchSettings.MAX_QUERY_LENGTH.getValue();

        // Get the set of keywords within the name.
        Set<String> keywords = extractKeywords(ripExtension(name), allowNumbers);

        String retString;
        if (keywords.isEmpty()) {
            // no suitable non-number words
            retString = StringUtils.truncate(removeIllegalChars(name), maxLen);
        } else {
            retString = constructQueryStringFromKeywords(maxLen, keywords);
        }

        // Added a bunch of asserts to catch bugs. There is some form of
        // input we are not considering in our algorithms....
        assert retString.length() <= maxLen
                : "Original filename: " + name + ", converted: " + retString;
        assert keywords.isEmpty() || !retString.equals("")
                : "Original filename: " + name;

        return retString;
    }

    /**
     * Constructs a space (" ") delimited query string that must be
     * &lt;= maxLen from a set of keywords.
     *
     * Keywords are appended in iteration order; a keyword that would push the
     * string past {@code maxLen} is skipped, and later, shorter keywords may
     * still be added. If no keyword fits at all, the first keyword truncated
     * to {@code maxLen} is used.
     *
     * @param maxLen maximum length of the resulting string
     * @param keywords set of keywords from which to generate the query string
     * @return the query string, or the empty string if {@code keywords} is
     *         empty
     */
    public static String constructQueryStringFromKeywords(int maxLen, Set<String> keywords) {
        // Nothing to build from; previously this fell through to
        // iterator().next() and threw NoSuchElementException.
        if (keywords.isEmpty()) {
            return "";
        }

        // adding keywords that fit when appended to query string field,
        // skipping keywords that do not fit.
        StringBuilder queryFieldValue = new StringBuilder();
        for (String keyword : keywords) {
            String delimIncl = (queryFieldValue.length() == 0) ? "" : " ";
            int lengthWithKeyword =
                    queryFieldValue.length() + delimIncl.length() + keyword.length();
            if (lengthWithKeyword <= maxLen) {
                queryFieldValue.append(delimIncl).append(keyword);
            }
        }

        // In case the query string field is blank: all keywords are longer
        // than maxLen, so use maxLen chars of the 1st keyword.
        if (queryFieldValue.length() == 0) {
            queryFieldValue.append(StringUtils.truncate(keywords.iterator().next(), maxLen));
        }

        return queryFieldValue.toString();
    }

    /**
     * Convenience wrapper for
     * {@link #createQueryString(String, boolean) createQueryString(String, false)}.
     *
     * @param name the name to build a query from
     * @return the query string with number keywords excluded
     */
    public static String createQueryString(String name) {
        return createQueryString(name, false);
    }

    /**
     * Tells whether the character is one of {@link #DELIMITERS}.
     *
     * @param c the character to test
     * @return true if {@code c} is a delimiter character
     */
    public static final boolean isDelimiter(char c) {
        return Arrays.binarySearch(DELIMITERS_CHARACTERS, c) >= 0;
    }
}