package com.limegroup.gnutella.spam;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.limewire.io.Address;
import org.limewire.io.Connectable;
import org.limewire.io.NetworkInstanceUtils;
import org.limewire.util.Base32;
import org.limewire.util.FileUtils;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.limegroup.gnutella.RemoteFileDesc;
import com.limegroup.gnutella.Response;
import com.limegroup.gnutella.URN;
import com.limegroup.gnutella.messages.BadPacketException;
import com.limegroup.gnutella.messages.QueryReply;
import com.limegroup.gnutella.messages.QueryRequest;
import com.limegroup.gnutella.util.QueryUtils;
import com.limegroup.gnutella.xml.LimeXMLDocument;
import com.limegroup.gnutella.xml.XMLStringUtils;
/**
 * This class splits a RemoteFileDesc, a QueryReply or a QueryRequest into
 * tokens that will be put into the RatingTable.
 * <p>
 * Currently, it extracts the following data to build tokens:
 * <ul>
 * <li>keywords from the file name or query string</li>
 * <li>the file extension (if any)</li>
 * <li>name/value pairs from the XML metadata (if any)</li>
 * <li>file urn (if any)</li>
 * <li>file size, both exact and approximate</li>
 * <li>address (but not port) of the sender, unless it is a private
 * address</li>
 * <li>client GUID of the sender</li>
 * </ul>
 *
 * The vendor string is no longer used, since it's too easy for spammers to
 * forge.
 */
@Singleton
public class Tokenizer {
private static final Log LOG = LogFactory.getLog(Tokenizer.class);
/**
* The maximum length of a keyword in chars; keywords longer than this will
* be truncated. We use chars rather than bytes to avoid corrupting
* multi-byte chars when truncating
*/
private int MAX_KEYWORD_LENGTH = 8;
private final NetworkInstanceUtils networkInstanceUtils;
@Inject
Tokenizer(NetworkInstanceUtils networkInstanceUtils) {
this.networkInstanceUtils = networkInstanceUtils;
}
/**
* Extracts a set of tokens from a RemoteFileDesc.
*
* @param desc the RemoteFileDesc that should be tokenized
* @return a non-empty set of Tokens
*/
public Set<Token> getTokens(RemoteFileDesc desc) {
Set<Token> set = new HashSet<Token>();
tokenize(desc, set);
return set;
}
/**
* Extracts a set of tokens from an array of RemoteFileDescs - useful if the
* user wants to mark multiple RFDs from a TableLine as spam (or not), which
* should rate each token only once.
*
* @param descs the array of RemoteFileDescs that should be tokenized
* @return a non-empty set of Tokens
*/
public Set<Token> getTokens(RemoteFileDesc[] descs) {
Set<Token> set = new HashSet<Token>();
for (RemoteFileDesc desc : descs)
tokenize(desc, set);
return set;
}
/**
* Extracts a set of tokens from a RemoteFileDesc.
*
* @param desc the RemoteFileDesc that should be tokenized
* @param set the set to which the tokens should be added
*/
private void tokenize(RemoteFileDesc desc, Set<Token> set) {
if (LOG.isDebugEnabled()) {
String addr = desc.getAddress().getAddressDescription();
LOG.debug("Tokenizing result from " + addr);
}
String name = desc.getFileName();
getKeywordTokens(FileUtils.getFilenameNoExtension(name), set);
String ext = FileUtils.getFileExtension(name);
if (!ext.equals(""))
set.add(new FileExtensionToken(ext));
LimeXMLDocument xml = desc.getXMLDocument();
if (xml != null)
getKeywordTokens(xml, set);
URN urn = desc.getSHA1Urn();
if (urn != null)
set.add(new UrnToken(urn.toString()));
set.add(new SizeToken(desc.getSize()));
set.add(new ApproximateSizeToken(desc.getSize()));
// Ignore private addresses such as 192.168.x.x
Address address = desc.getAddress();
if (address instanceof Connectable) {
Connectable connectable = (Connectable) address;
if (!networkInstanceUtils.isPrivateAddress(connectable.getInetAddress()))
set.add(new AddressToken(connectable.getAddress()));
}
set.add(new ClientGUIDToken(Base32.encode(desc.getClientGUID())));
}
/**
* Tokenizes a QueryReply. Filenames and XML metadata are ignored.
*
* @param qr the QueryReply that should be tokenized
* @return a non-empty set of Tokens
*/
public Set<Token> getNonKeywordTokens(QueryReply qr) {
if (LOG.isDebugEnabled())
LOG.debug("Tokenizing query reply from " + qr.getIP());
Set<Token> set = new HashSet<Token>();
// Client GUID
set.add(new ClientGUIDToken(Base32.encode(qr.getClientGUID())));
// Responder's address, unless private
String ip = qr.getIP();
if (!networkInstanceUtils.isPrivateAddress(ip))
set.add(new AddressToken(ip));
try {
for (Response r : qr.getResultsArray()) {
// URNs
for (URN urn : r.getUrns())
set.add(new UrnToken(urn.toString()));
// File sizes
long size = r.getSize();
set.add(new SizeToken(size));
set.add(new ApproximateSizeToken(size));
}
} catch (BadPacketException ignored) {
}
return set;
}
/**
* Tokenizes a QueryRequest, including the search terms, XML metadata and
* URN (if any) - we clear the spam ratings of search tokens and ignore them
* for spam rating purposes for the rest of the session.
*
* @param qr the QueryRequest that should be tokenized
* @return a set of Tokens, may be empty
*/
public Set<Token> getTokens(QueryRequest qr) {
if (LOG.isDebugEnabled())
LOG.debug("Tokenizing " + qr);
Set<Token> set = new HashSet<Token>();
getKeywordTokens(qr.getQuery(), set);
LimeXMLDocument xml = qr.getRichQuery();
if (xml != null)
getKeywordTokens(xml, set);
Set<URN> urns = qr.getQueryUrns();
for (URN urn : urns)
set.add(new UrnToken(urn.toString()));
return set;
}
/**
* Extracts KeywordTokens from an XML metadata document.
*
* @param doc the LimeXMLDocument that should be tokenized
* @param set the set to which the tokens should be added
*/
private void getKeywordTokens(LimeXMLDocument doc, Set<Token> set) {
for (Map.Entry<String, String> entry : doc.getNameValueSet()) {
String name = entry.getKey().toString();
String value = entry.getValue().toString();
getXMLKeywords(name, value, set);
}
}
/**
* Extracts XMLKeywordTokens from the field name and value of an XML
* metadata item.
*
* @param name the field name as a String (eg audios_audio_bitrate)
* @param value the value as a String
* @param set the set to which the tokens should be added
*/
private void getXMLKeywords(String name, String value, Set<Token> set) {
name = extractSimpleFieldName(name);
name.toLowerCase(Locale.US);
value.toLowerCase(Locale.US);
for (String keyword : QueryUtils.extractKeywords(value, false)) {
if (keyword.length() > MAX_KEYWORD_LENGTH)
keyword = keyword.substring(0, MAX_KEYWORD_LENGTH);
set.add(new XMLKeywordToken(name, keyword));
}
}
/**
* Extracts the last part of the field name for a canonical field name (eg
* audios_audio_bitrate becomes bitrate).
*
* @param name the canonical field name
* @return the last part of the canonical field name
*/
private String extractSimpleFieldName(String name) {
int idx1 = name.lastIndexOf(XMLStringUtils.DELIMITER);
int idx2 = name.lastIndexOf(XMLStringUtils.DELIMITER, idx1 - 1);
return name.substring(idx2 + XMLStringUtils.DELIMITER.length(), idx1);
}
/**
* Splits a String into keyword tokens using QueryUtils.extractKeywords().
*
* @param str the String to tokenize
* @param set the set to which the tokens should be added
*/
private void getKeywordTokens(String str, Set<Token> set) {
str.toLowerCase(Locale.US);
for (String keyword : QueryUtils.extractKeywords(str, false)) {
if (keyword.length() > MAX_KEYWORD_LENGTH)
keyword = keyword.substring(0, MAX_KEYWORD_LENGTH);
set.add(new KeywordToken(keyword));
}
}
}