package com.limegroup.gnutella.spam;

import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.limegroup.gnutella.RemoteFileDesc;
import com.limegroup.gnutella.URN;
import com.limegroup.gnutella.messages.QueryRequest;
import com.limegroup.gnutella.xml.LimeXMLDocument;
import com.limegroup.gnutella.xml.XMLStringUtils;

/**
 * This class is an important part of the spam filter. It splits a
 * RemoteFileDesc (or a QueryRequest) into tokens that will then be put into
 * the RatingTable.
 * 
 * Currently, it extracts the following data from the RemoteFileDesc to build
 * tokens:
 * 
 * <ul>
 * <li>file size</li>
 * <li>file urn</li>
 * <li>address:port of the sender</li>
 * <li>all keywords in the LimeXMLDocuments that are longer than 2 bytes and
 * the fields they were found in</li>
 * <li>all keywords that are longer than 2 bytes, plus tokens built by merging
 * each pair of consecutive keywords</li>
 * </ul>
 */
public class Tokenizer {
    private static final Log LOG = LogFactory.getLog(Tokenizer.class);

    /**
     * The minimum number of bytes for a keyword; keywords shorter than this
     * will be ignored.
     * 
     * NOTE: we use the number of bytes, not chars, to decide whether or not
     * to block because e.g. in Japanese many words will be only one or two
     * chars long.
     */
    private static final int MIN_KEYWORD_LENGTH = 3;

    /**
     * The maximum number of bytes for a keyword; keywords longer than this
     * will be truncated.
     */
    private static final int MAX_KEYWORD_LENGTH = 8;

    /**
     * These are the characters used to split file names and meta-data fields
     * into keyword tokens.
     */
    private static final String KEYWORD_DELIMITERS = " -._+/*()\\,\t";

    private Tokenizer() {
    }

    /**
     * Splits a <tt>RemoteFileDesc</tt> into an array of <tt>Token</tt>.
     * 
     * @param desc
     *            the RemoteFileDesc that should be tokenized
     * @return an array of Tokens, will never be empty
     */
    public static Token[] getTokens(RemoteFileDesc desc) {
        if (LOG.isDebugEnabled())
            LOG.debug("tokenizing: " + desc);
        Set set = new HashSet();
        set.addAll(getKeywordTokens(desc));
        if (desc.getSHA1Urn() != null)
            set.add(getUrnToken(desc));
        set.add(getSizeToken(desc));
        set.add(getVendorToken(desc));
        set.add(getAddressToken(desc));
        return (Token[]) set.toArray(new Token[set.size()]);
    }

    /**
     * Splits an array of <tt>RemoteFileDesc</tt> into an array of unique
     * <tt>Token</tt>. This is useful if the user wants to mark multiple RFDs
     * from a TableLine as spam, since every token for this table line should
     * be rated as spam only once.
     * 
     * @param descs
     *            the array of RemoteFileDesc that should be tokenized
     * @return an array of Tokens, will never be empty
     */
    public static Token[] getTokens(RemoteFileDesc[] descs) {
        Set set = new HashSet();
        for (int i = 0; i < descs.length; i++) {
            if (LOG.isDebugEnabled())
                LOG.debug("tokenizing: " + descs[i]);
            set.addAll(getKeywordTokens(descs[i]));
            if (descs[i].getSHA1Urn() != null)
                set.add(getUrnToken(descs[i]));
            set.add(getSizeToken(descs[i]));
            set.add(getVendorToken(descs[i]));
            set.add(getAddressToken(descs[i]));
        }
        return (Token[]) set.toArray(new Token[set.size()]);
    }
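    /*
     * Illustrative sketch of what the getTokens methods above produce (all
     * values here are hypothetical, not taken from the codebase): for a
     * result named "free music.mp3" of size 3000000, vendor "LIME", sent
     * from 1.2.3.4:6346, getTokens(desc) would yield roughly the keyword
     * tokens described below, plus a SizeToken for 3000000, a VendorToken
     * for "LIME", an AddressToken for 1.2.3.4:6346, and an UrnToken if the
     * RFD carries a SHA1 urn.
     */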
    /**
     * Tokenizes a QueryRequest; used to clear all ratings for keywords the
     * user issues a query for.
     * 
     * @param qr
     *            the <tt>QueryRequest</tt> to tokenize
     * @return an array of <tt>Token</tt>
     */
    public static Token[] getTokens(QueryRequest qr) {
        if (LOG.isDebugEnabled())
            LOG.debug("tokenizing: " + qr);
        Set set = new HashSet();
        set.addAll(getKeywordTokens(qr));
        set.addAll(getUrnTokens(qr));
        return (Token[]) set.toArray(new Token[set.size()]);
    }

    /**
     * Builds an UrnToken for a RemoteFileDesc.
     * 
     * @param desc
     *            the RFD we are tokenizing
     * @return a new UrnToken built from the SHA1 urn, or null if the RFD
     *         does not contain a URN
     */
    private static Token getUrnToken(RemoteFileDesc desc) {
        if (desc.getSHA1Urn() != null)
            return new UrnToken(desc.getSHA1Urn());
        return null;
    }

    /**
     * Builds a Set of UrnToken for a QueryRequest.
     * 
     * @param qr
     *            the QueryRequest we are tokenizing
     * @return a Set of UrnToken, built from the query urns
     */
    private static Set getUrnTokens(QueryRequest qr) {
        if (qr.getQueryUrns().isEmpty())
            return Collections.EMPTY_SET;
        Set urns = qr.getQueryUrns();
        Set ret = new HashSet();
        for (Iterator iter = urns.iterator(); iter.hasNext();)
            ret.add(new UrnToken((URN) iter.next()));
        return ret;
    }

    /**
     * Builds a SizeToken for a RemoteFileDesc.
     * 
     * @param desc
     *            the RemoteFileDesc we are tokenizing
     * @return a new SizeToken
     */
    private static Token getSizeToken(RemoteFileDesc desc) {
        return new SizeToken(desc.getSize());
    }

    /**
     * Returns a (most often previously cached) token for the specific
     * vendor.
     */
    private static Token getVendorToken(RemoteFileDesc desc) {
        return VendorToken.getToken(desc.getVendor());
    }

    /**
     * Builds an AddressToken for a RemoteFileDesc.
     * 
     * @param desc
     *            the RemoteFileDesc we are tokenizing
     * @return a new AddressToken
     */
    private static Token getAddressToken(RemoteFileDesc desc) {
        return new AddressToken(desc.getInetAddress().getAddress(),
                desc.getPort());
    }

    /**
     * Builds a Set of KeywordToken for a RemoteFileDesc.
     * 
     * @param desc
     *            the RemoteFileDesc we are tokenizing
     * @return a Set of KeywordToken and XMLKeywordToken
     */
    private static Set getKeywordTokens(RemoteFileDesc desc) {
        return getKeywordTokens(desc.getFileName(), desc.getXMLDocument());
    }

    /**
     * Builds a Set of KeywordToken for a QueryRequest.
     * 
     * @param qr
     *            the QueryRequest we are tokenizing
     * @return a Set of KeywordToken and XMLKeywordToken
     */
    private static Set getKeywordTokens(QueryRequest qr) {
        return getKeywordTokens(qr.getQuery(), qr.getRichQuery());
    }

    /**
     * Builds a Set of KeywordToken from a file name and an optional
     * LimeXMLDocument.
     * 
     * @param fname
     *            the file name that should be split into KeywordToken
     * @param doc
     *            the LimeXMLDocument that should be split into
     *            XMLKeywordToken
     * @return a Set of KeywordToken and XMLKeywordToken
     */
    private static Set getKeywordTokens(String fname, LimeXMLDocument doc) {
        Set tokens = getKeywordTokens(fname.toLowerCase(Locale.US));
        if (doc != null) {
            for (Iterator iter = doc.getNameValueSet().iterator();
                    iter.hasNext();) {
                Map.Entry next = (Map.Entry) iter.next();
                tokens.addAll(getXMLKeywords(
                        next.getKey().toString().toLowerCase(Locale.US),
                        next.getValue().toString().toLowerCase(Locale.US)));
            }
        }
        return tokens;
    }
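    /*
     * Illustrative sketch (hypothetical metadata values; the field-name form
     * follows the extractSimpleFieldName javadoc below): an entry mapping
     * "audios_audio_title" to "My Favourite Song" is lowercased and yields
     * XMLKeywordToken("title", "favourit") ("favourite" truncated to
     * MAX_KEYWORD_LENGTH bytes) and XMLKeywordToken("title", "song");
     * "my" is shorter than MIN_KEYWORD_LENGTH and produces no token.
     */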
    /**
     * Gets a Set of XMLKeywordToken for the field name and the value of an
     * XML meta-data item.
     * 
     * @param name
     *            the field name as String (something like
     *            audios_audio_bitrate)
     * @param value
     *            the value String
     * @return a Set of XMLKeywordToken
     */
    private static Set getXMLKeywords(String name, String value) {
        name = extractSimpleFieldName(name);
        Set ret = new HashSet();
        StringTokenizer tok = new StringTokenizer(value, KEYWORD_DELIMITERS);
        while (tok.hasMoreTokens()) {
            byte[] token = tok.nextToken().getBytes();
            if (token.length < MIN_KEYWORD_LENGTH)
                continue;
            if (token.length > MAX_KEYWORD_LENGTH)
                token = truncateArray(token, MAX_KEYWORD_LENGTH);
            ret.add(new XMLKeywordToken(name, token));
        }
        return ret;
    }

    /**
     * Truncates an array of bytes.
     * 
     * @param array
     *            the source array
     * @param length
     *            the length of the truncated array
     * @return an array of the first length bytes of the source array
     */
    private static byte[] truncateArray(byte[] array, int length) {
        byte[] ret = new byte[length];
        System.arraycopy(array, 0, ret, 0, length);
        return ret;
    }

    /**
     * Merges two byte arrays into one byte array, separating the two arrays
     * by a single 0x00 byte.
     * 
     * @param array1
     *            the first array
     * @param array2
     *            the second array
     * @return the merged byte array
     */
    private static byte[] mergeArrays(byte[] array1, byte[] array2) {
        byte[] ret = new byte[array1.length + array2.length + 1];
        System.arraycopy(array1, 0, ret, 0, array1.length);
        ret[array1.length] = 0;
        System.arraycopy(array2, 0, ret, array1.length + 1, array2.length);
        return ret;
    }

    /**
     * Extracts the last part of a canonical field name
     * (audios_audio_bitrate becomes bitrate).
     * 
     * @param canonicalField
     *            the canonical field name
     * @return the last part of the canonical field name
     */
    private static String extractSimpleFieldName(String canonicalField) {
        int idx1 = canonicalField.lastIndexOf(XMLStringUtils.DELIMITER);
        int idx2 = canonicalField.lastIndexOf(XMLStringUtils.DELIMITER,
                idx1 - 1);
        return canonicalField.substring(
                idx2 + XMLStringUtils.DELIMITER.length(), idx1);
    }

    /**
     * Splits a String into keyword tokens. Keywords of at least
     * MIN_KEYWORD_LENGTH bytes become stand-alone tokens (truncated to
     * MAX_KEYWORD_LENGTH bytes); in addition, every pair of consecutive
     * keywords is merged into a single token.
     * 
     * @param str
     *            the String to tokenize
     * @return a Set of KeywordToken
     */
    private static Set getKeywordTokens(String str) {
        Set ret = new HashSet();
        StringTokenizer tok = new StringTokenizer(str, KEYWORD_DELIMITERS);
        byte[] last = null;
        while (tok.hasMoreTokens()) {
            byte[] next = tok.nextToken().getBytes();
            if (next.length < MIN_KEYWORD_LENGTH) {
                // too short to stand alone, but still merge it with the
                // previous keyword so that short words are not lost entirely
                if (last != null) {
                    Token token = new KeywordToken(mergeArrays(last, next));
                    ret.add(token);
                }
                last = next;
                continue;
            }
            if (next.length > MAX_KEYWORD_LENGTH)
                next = truncateArray(next, MAX_KEYWORD_LENGTH);
            Token token = new KeywordToken(next);
            ret.add(token);
            if (last != null) {
                token = new KeywordToken(mergeArrays(last, next));
                ret.add(token);
            }
            last = next;
        }
        return ret;
    }
}
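/*
 * Worked example for getKeywordTokens(String) above (illustrative input):
 * tokenizing "my new song.mp3" splits into "my", "new", "song" and "mp3".
 * "my" (2 bytes) is below MIN_KEYWORD_LENGTH and yields no stand-alone
 * token, so the resulting set is { "new", "song", "mp3", "my\0new",
 * "new\0song", "song\0mp3" }, where \0 marks the 0x00 separator inserted
 * by mergeArrays.
 */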