package com.limegroup.gnutella.spam;

import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.limewire.io.Address;
import org.limewire.io.Connectable;
import org.limewire.io.IpPort;
import org.limewire.io.NetworkInstanceUtils;
import org.limewire.util.Base32;
import org.limewire.util.FileUtils;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.limegroup.gnutella.RemoteFileDesc;
import com.limegroup.gnutella.Response;
import com.limegroup.gnutella.ResponseVerifier;
import com.limegroup.gnutella.URN;
import com.limegroup.gnutella.messages.BadPacketException;
import com.limegroup.gnutella.messages.QueryReply;
import com.limegroup.gnutella.messages.QueryRequest;
import com.limegroup.gnutella.util.QueryUtils;
import com.limegroup.gnutella.xml.LimeXMLDocument;
import com.limegroup.gnutella.xml.LimeXMLNames;
import com.limegroup.gnutella.xml.XMLStringUtils;

/**
 * This class splits a RemoteFileDesc or a QueryRequest into tokens that will
 * be put into the RatingTable.
 * <p>
 * Currently it extracts the following data to build tokens:
 * <ul>
 * <li>keywords from the file name or query string</li>
 * <li>name/value pairs from the XML metadata (if any)</li>
 * <li>file URN (if any)</li>
 * <li>file size</li>
 * <li>address (but not port) of the sender</li>
 * </ul>
 *
 * The vendor string is no longer used, since it's too easy for spammers to
 * forge.
 */
@Singleton
public class Tokenizer {

    private static final Log LOG = LogFactory.getLog(Tokenizer.class);

    /**
     * The maximum length of a keyword in chars; keywords longer than this
     * will be truncated. We use chars rather than bytes to avoid corrupting
     * multi-byte chars when truncating.
     */
    private static final int MAX_KEYWORD_LENGTH = 8;

    private final NetworkInstanceUtils networkInstanceUtils;
    private final ResponseVerifier responseVerifier;
    private final TemplateHashTokenFactory templateHashTokenFactory;

    @Inject
    Tokenizer(NetworkInstanceUtils networkInstanceUtils,
            ResponseVerifier responseVerifier,
            TemplateHashTokenFactory templateHashTokenFactory) {
        this.networkInstanceUtils = networkInstanceUtils;
        this.responseVerifier = responseVerifier;
        this.templateHashTokenFactory = templateHashTokenFactory;
    }

    /**
     * Extracts a set of tokens from a RemoteFileDesc.
     *
     * @param desc the RemoteFileDesc that should be tokenized
     * @return a non-empty set of Tokens
     */
    public Set<Token> getTokens(RemoteFileDesc desc) {
        Set<Token> set = new HashSet<Token>();
        tokenize(desc, set);
        return set;
    }

    /**
     * Extracts a set of tokens from an array of RemoteFileDescs - useful if
     * the user wants to mark multiple RFDs from a TableLine as spam (or not),
     * in which case each token should be rated only once.
     *
     * @param descs the array of RemoteFileDescs that should be tokenized
     * @return a non-empty set of Tokens
     */
    public Set<Token> getTokens(RemoteFileDesc[] descs) {
        Set<Token> set = new HashSet<Token>();
        for(RemoteFileDesc desc : descs)
            tokenize(desc, set);
        return set;
    }
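    /*
     * Illustrative usage sketch, not part of the original class: rating a
     * whole TableLine's worth of results in one pass, so that tokens shared
     * by several results are rated once rather than once per result. The
     * RatingTable method shown ("rate") is a hypothetical name used for
     * illustration, not the verified RatingTable API.
     *
     *   Set<Token> tokens = tokenizer.getTokens(selectedDescs);
     *   for(Token token : tokens)
     *       ratingTable.rate(token, userMarkedAsSpam); // hypothetical
     */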
    /**
     * Extracts a set of tokens from a RemoteFileDesc.
     *
     * @param desc the RemoteFileDesc that should be tokenized
     * @param set the set to which the tokens should be added
     */
    private void tokenize(RemoteFileDesc desc, Set<Token> set) {
        if(LOG.isDebugEnabled()) {
            String addr = desc.getAddress().getAddressDescription();
            LOG.debug("Tokenizing result from " + addr);
        }
        String name = desc.getFileName();
        byte[] queryGUID = desc.getQueryGUID();
        if(queryGUID != null) {
            String query = responseVerifier.getQueryString(queryGUID);
            if(query != null) {
                // Detect filenames generated from the query by a template
                Token t = templateHashTokenFactory.create(query, name);
                if(t != null)
                    set.add(t);
            }
        }
        getKeywordTokens(FileUtils.getFilenameNoExtension(name), set);
        String ext = FileUtils.getFileExtension(name);
        if(!ext.equals(""))
            set.add(new FileExtensionToken(ext));
        LimeXMLDocument doc = desc.getXMLDocument();
        if(doc != null) {
            getKeywordTokens(doc, set);
            // A torrent's infohash is a SHA1, so it can be used as a URN
            String infohash = doc.getValue(LimeXMLNames.TORRENT_INFO_HASH);
            if(infohash != null)
                set.add(new UrnToken("urn:sha1:" + infohash));
        }
        URN urn = desc.getSHA1Urn();
        if(urn != null)
            set.add(new UrnToken(urn.toString()));
        set.add(new SizeToken(desc.getSize()));
        set.add(new ApproximateSizeToken(desc.getSize()));
        // Ignore friend addresses and private addresses such as 192.168.x.x
        Address address = desc.getAddress();
        if(address instanceof Connectable) {
            Connectable connectable = (Connectable)address;
            if(!networkInstanceUtils.isPrivateAddress(connectable.getInetAddress()))
                set.add(new AddressToken(connectable.getAddress()));
        }
        set.add(new ClientGUIDToken(Base32.encode(desc.getClientGUID())));
    }

    /**
     * Tokenizes a QueryReply. Keywords from the filenames and XML metadata
     * are ignored, but templates are extracted from the filenames.
     *
     * @param qr the QueryReply that should be tokenized
     * @return a non-empty set of Tokens
     */
    public Set<Token> getNonKeywordTokens(QueryReply qr) {
        if(LOG.isDebugEnabled())
            LOG.debug("Tokenizing query reply from " + qr.getIP());
        Set<Token> set = new HashSet<Token>();
        String query = responseVerifier.getQueryString(qr.getGUID());
        // Client GUID
        set.add(new ClientGUIDToken(Base32.encode(qr.getClientGUID())));
        // Responder's address, unless private
        String ip = qr.getIP();
        if(!networkInstanceUtils.isPrivateAddress(ip))
            set.add(new AddressToken(ip));
        try {
            for(Response r : qr.getResultsArray()) {
                // Template
                if(query != null) {
                    Token t = templateHashTokenFactory.create(query, r.getName());
                    if(t != null)
                        set.add(t);
                }
                // URNs
                for(URN urn : r.getUrns())
                    set.add(new UrnToken(urn.toString()));
                LimeXMLDocument doc = r.getDocument();
                if(doc != null) {
                    String infohash = doc.getValue(LimeXMLNames.TORRENT_INFO_HASH);
                    if(infohash != null)
                        set.add(new UrnToken("urn:sha1:" + infohash));
                }
                // File sizes
                long size = r.getSize();
                set.add(new SizeToken(size));
                set.add(new ApproximateSizeToken(size));
                // Alt-loc addresses, unless private
                for(IpPort ipp : r.getLocations()) {
                    ip = ipp.getInetAddress().getHostAddress();
                    if(!networkInstanceUtils.isPrivateAddress(ip))
                        set.add(new AddressToken(ip));
                }
            }
        } catch(BadPacketException ignored) {}
        return set;
    }
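    /*
     * Illustrative sketch, with assumed names: scoring an incoming reply
     * using only the non-keyword tokens returned above. Keyword tokens are
     * deliberately absent here, per the Javadoc on getNonKeywordTokens(),
     * so the score comes from addresses, GUIDs, URNs, sizes and templates.
     * The ratingTable method and threshold are hypothetical.
     *
     *   Set<Token> tokens = tokenizer.getNonKeywordTokens(queryReply);
     *   float rating = ratingTable.getRating(tokens); // hypothetical method
     *   boolean looksLikeSpam = rating > spamThreshold; // assumed threshold
     */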
    /**
     * Tokenizes a QueryRequest, including the search terms, XML metadata and
     * URNs (if any). We clear the spam ratings of search tokens and ignore
     * them for spam rating purposes for the rest of the session.
     *
     * @param qr the QueryRequest that should be tokenized
     * @return a set of Tokens, may be empty
     */
    public Set<Token> getTokens(QueryRequest qr) {
        if(LOG.isDebugEnabled())
            LOG.debug("Tokenizing " + qr);
        Set<Token> set = new HashSet<Token>();
        getKeywordTokens(qr.getQuery(), set);
        LimeXMLDocument xml = qr.getRichQuery();
        if(xml != null)
            getKeywordTokens(xml, set);
        Set<URN> urns = qr.getQueryUrns();
        for(URN urn : urns)
            set.add(new UrnToken(urn.toString()));
        return set;
    }

    /**
     * Extracts KeywordTokens from an XML metadata document.
     *
     * @param doc the LimeXMLDocument that should be tokenized
     * @param set the set to which the tokens should be added
     */
    private void getKeywordTokens(LimeXMLDocument doc, Set<Token> set) {
        for(Map.Entry<String, String> entry : doc.getNameValueSet()) {
            String name = entry.getKey();
            String value = entry.getValue();
            getXMLKeywords(name, value, set);
        }
    }

    /**
     * Extracts XMLKeywordTokens from the field name and value of an XML
     * metadata item.
     *
     * @param name the field name as a String (e.g. audios__audio__bitrate__)
     * @param value the value as a String
     * @param set the set to which the tokens should be added
     */
    private void getXMLKeywords(String name, String value, Set<Token> set) {
        name = extractSimpleFieldName(name);
        // String.toLowerCase() returns a new String; assign the result,
        // otherwise the case conversion is silently discarded
        name = name.toLowerCase(Locale.US);
        value = value.toLowerCase(Locale.US);
        for(String keyword : QueryUtils.extractKeywords(value, false)) {
            if(keyword.length() > MAX_KEYWORD_LENGTH)
                keyword = keyword.substring(0, MAX_KEYWORD_LENGTH);
            set.add(new XMLKeywordToken(name, keyword));
        }
    }

    /**
     * Extracts the last part of the field name from a canonical field name
     * (e.g. audios__audio__bitrate__ becomes bitrate).
     *
     * @param name the canonical field name
     * @return the last part of the canonical field name
     */
    private String extractSimpleFieldName(String name) {
        int idx1 = name.lastIndexOf(XMLStringUtils.DELIMITER);
        int idx2 = name.lastIndexOf(XMLStringUtils.DELIMITER, idx1 - 1);
        return name.substring(idx2 + XMLStringUtils.DELIMITER.length(), idx1);
    }

    /**
     * Splits a String into keyword tokens using QueryUtils.extractKeywords().
     *
     * @param str the String to tokenize
     * @param set the set to which the tokens should be added
     */
    private void getKeywordTokens(String str, Set<Token> set) {
        // As above, assign the lowercased copy rather than discarding it
        str = str.toLowerCase(Locale.US);
        for(String keyword : QueryUtils.extractKeywords(str, false)) {
            if(keyword.length() > MAX_KEYWORD_LENGTH)
                keyword = keyword.substring(0, MAX_KEYWORD_LENGTH);
            set.add(new KeywordToken(keyword));
        }
    }
}
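/*
 * Worked example, illustrative only: how one XML metadata field flows
 * through getXMLKeywords(), assuming XMLStringUtils.DELIMITER is "__".
 *
 *   getXMLKeywords("audios__audio__bitrate__", "192", set)
 *     -> extractSimpleFieldName(...) returns "bitrate"
 *     -> set gains XMLKeywordToken("bitrate", "192")
 *
 * Keywords longer than MAX_KEYWORD_LENGTH (8) chars are truncated, so a
 * value keyword such as "soundtrack" would be stored as "soundtra".
 */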