package org.limewire.promotion;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.limewire.util.ByteUtils;
import org.limewire.util.I18NConvert;
import org.limewire.util.StringUtils;

import com.google.inject.Singleton;

/**
 * {@link KeywordUtil} implementation that normalizes keyword queries
 * (punctuation stripping, I18N normalization, alphabetical ordering and
 * stop-word removal) and derives a non-negative hash value from the two
 * longest words of a normalized query.
 * <p>
 * The stop-word set is a plain {@link HashSet} that can be modified through
 * {@link #addEnglishStopword(String)}; no synchronization is performed here.
 */
@Singleton
public final class KeywordUtilImpl implements KeywordUtil {

    /** Stop words every instance starts out with. */
    private static final String[] DEFAULT_ENGLISH_STOPWORDS = { "i", "a", "s", "about", "an",
            "are", "as", "at", "be", "by", "com", "de", "en", "for", "from", "how", "in", "is",
            "it", "la", "of", "on", "or", "that", "the", "this", "to", "was", "what", "when",
            "where", "who", "will", "with", "und", "www" };

    /** Orders longest words first; words of equal length are ordered case-insensitively. */
    private static final Comparator<String> LONGEST_FIRST = new Comparator<String>() {
        public int compare(String o1, String o2) {
            if (o1.length() == o2.length()) {
                return o1.compareToIgnoreCase(o2);
            }
            return o2.length() - o1.length();
        }
    };

    private final Set<String> stopwords = new HashSet<String>();

    /** Creates an instance pre-populated with the default English stop words. */
    public KeywordUtilImpl() {
        initEnglishStopwords();
    }

    /**
     * Splits a tab-separated keyword string and normalizes every entry.
     *
     * @param keywords tab-separated keyword phrases; must not be null
     * @return one normalized phrase per tab-separated entry, in input order
     * @throws NullPointerException if <code>keywords</code> is null
     */
    public List<String> splitKeywords(String keywords) {
        List<String> list = new ArrayList<String>();
        for (String word : keywords.split("\t")) {
            list.add(normalizeQuery(word));
        }
        return list;
    }

    /** Loads the default stop words into this instance's set. */
    private void initEnglishStopwords() {
        for (String word : DEFAULT_ENGLISH_STOPWORDS) {
            addEnglishStopword(word);
        }
    }

    /** Adds a word to the instance's set of stop words. */
    public void addEnglishStopword(String word) {
        stopwords.add(word);
    }

    /**
     * Replaces each punctuation character (<code>, . ! ? &lt; &gt; : ; * ' " $</code>)
     * and each whitespace character with a single space. Runs are not
     * collapsed: every matched character yields its own space.
     */
    String stripPunctuation(String query) {
        return query.replaceAll("[,.!?<>:;\\*'\"\\$\\s]", " ");
    }

    /**
     * Normalizes a query: punctuation and whitespace become spaces, the text
     * is passed through {@link I18NConvert}, the space-separated words are
     * sorted case-insensitively, stop words are removed (see
     * {@link #stripEnglishStopWords(String[])}) and the words are re-joined
     * with single spaces. Empty tokens produced by the split are kept.
     *
     * @param query the raw query, may be null
     * @return the normalized query, <code>null</code> if <code>query</code>
     *         is null, or the empty string if no words remain
     */
    public String normalizeQuery(String query) {
        if (query == null) {
            return null;
        }
        query = stripPunctuation(query);
        query = I18NConvert.instance().getNorm(query);
        String[] queryArray = sortAlphabetically(query.split(" "));
        queryArray = stripEnglishStopWords(queryArray);
        return unsplitString(queryArray);
    }

    /**
     * Joins the given words with single spaces.
     *
     * @return the joined string; the empty string when <code>words</code> is
     *         empty (a query consisting only of separators splits into a
     *         zero-length array, which previously caused a
     *         StringIndexOutOfBoundsException here)
     */
    String unsplitString(String[] words) {
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < words.length; i++) {
            if (i > 0) {
                builder.append(' ');
            }
            builder.append(words[i]);
        }
        return builder.toString();
    }

    /**
     * @return an alphabetically (case-insensitively) sorted copy of the given
     *         words; the input array is not modified.
     */
    String[] sortAlphabetically(String[] words) {
        String[] sorted = words.clone();
        // Same ordering as String.compareToIgnoreCase.
        Arrays.sort(sorted, String.CASE_INSENSITIVE_ORDER);
        return sorted;
    }

    /**
     * @return a new array with all English stop words removed, otherwise
     *         ordered the same as the original array. If the resulting array is
     *         less than 2 words long, the original array is returned (to
     *         prevent us from dropping queries like "The Who")
     */
    String[] stripEnglishStopWords(String[] words) {
        List<String> strippedWords = new ArrayList<String>();
        for (String word : words) {
            if (!stopwords.contains(word)) {
                strippedWords.add(word);
            }
        }
        if (strippedWords.size() >= 2) {
            return strippedWords.toArray(new String[strippedWords.size()]);
        }
        return words;
    }

    /**
     * @return a sorted copy of the given words, longest first, same length
     *         words sorted by alpha (case-insensitively); the input array is
     *         not modified.
     */
    String[] sortByLength(String[] words) {
        String[] sorted = words.clone();
        Arrays.sort(sorted, LONGEST_FIRST);
        return sorted;
    }

    /**
     * Computes a non-negative hash for a query. The query is normalized, its
     * two longest words (at most) are joined with a space, and the first 8
     * bytes of the SHA-1 of that text are converted to a long with the top
     * bit cleared.
     *
     * @param query the raw query; must not be null
     * @return a hash value that is never negative
     * @throws NullPointerException if <code>query</code> is null
     */
    public long getHashValue(String query) {
        query = normalizeQuery(query);
        final String[] words = sortByLength(query.split(" "));
        query = "";
        if (words.length > 0) {
            query = words[0];
        }
        if (words.length > 1) {
            query += " " + words[1];
        }
        final byte[] sha1 = computeSHA1(query);
        final byte[] hashArray = new byte[8];
        System.arraycopy(sha1, 0, hashArray, 0, 8);
        // Make sure it's not negative (no leading bit set)
        hashArray[0] &= 127;
        return ByteUtils.beb2long(hashArray, 0, 8);
    }

    /**
     * Returns the SHA-1 digest of the UTF-8 bytes of the normalized input.
     * <p>
     * NOTE(review): the input is normalized again here even though
     * {@link #getHashValue(String)} already normalized it. This is kept
     * unchanged because removing it could change the hash values produced.
     *
     * @throws RuntimeException if the SHA-1 algorithm is unavailable
     */
    private byte[] computeSHA1(String input) {
        final MessageDigest outputSHA1;
        try {
            outputSHA1 = MessageDigest.getInstance("SHA-1");
        } catch (NoSuchAlgorithmException ex) {
            throw new RuntimeException("NoSuchAlgorithmException during computation: ", ex);
        }
        // The data is already fully in memory, so digest it in one call.
        return outputSHA1.digest(StringUtils.toUTF8Bytes(normalizeQuery(input)));
    }
}