package com.limegroup.gnutella.spam;

import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.limewire.io.Address;
import org.limewire.io.Connectable;
import org.limewire.io.IpPort;
import org.limewire.io.NetworkInstanceUtils;
import org.limewire.util.Base32;
import org.limewire.util.FileUtils;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.limegroup.gnutella.RemoteFileDesc;
import com.limegroup.gnutella.Response;
import com.limegroup.gnutella.ResponseVerifier;
import com.limegroup.gnutella.URN;
import com.limegroup.gnutella.messages.BadPacketException;
import com.limegroup.gnutella.messages.QueryReply;
import com.limegroup.gnutella.messages.QueryRequest;
import com.limegroup.gnutella.util.QueryUtils;
import com.limegroup.gnutella.xml.LimeXMLDocument;
import com.limegroup.gnutella.xml.LimeXMLNames;
import com.limegroup.gnutella.xml.XMLStringUtils;

/**
 * This class splits a RemoteFileDesc or a QueryRequest into tokens that will
 * be put into the RatingTable.
 * <p>
 * Currently it extracts the following data to build tokens:
 * <ul>
 * <li>keywords from the file name or query string</li>
 * <li>name/value pairs from the XML metadata (if any)</li>
 * <li>file URN (if any)</li>
 * <li>file size</li>
 * <li>address (but not port) of the sender</li>
 * </ul>
 *
 * The vendor string is no longer used, since it's too easy for spammers to
 * forge.
 */
@Singleton
public class Tokenizer {

    private static final Log LOG = LogFactory.getLog(Tokenizer.class);

    /**
     * The maximum length of a keyword in chars; keywords longer than this
     * will be truncated. We use chars rather than bytes to avoid corrupting
     * multi-byte chars when truncating.
     */
    private static final int MAX_KEYWORD_LENGTH = 8;

    private final NetworkInstanceUtils networkInstanceUtils;
    private final ResponseVerifier responseVerifier;
    private final TemplateHashTokenFactory templateHashTokenFactory;

    @Inject
    Tokenizer(NetworkInstanceUtils networkInstanceUtils,
            ResponseVerifier responseVerifier,
            TemplateHashTokenFactory templateHashTokenFactory) {
        this.networkInstanceUtils = networkInstanceUtils;
        this.responseVerifier = responseVerifier;
        this.templateHashTokenFactory = templateHashTokenFactory;
    }

    /**
     * Extracts a set of tokens from a RemoteFileDesc.
     *
     * @param desc the RemoteFileDesc that should be tokenized
     * @return a non-empty set of Tokens
     */
    public Set<Token> getTokens(RemoteFileDesc desc) {
        Set<Token> set = new HashSet<Token>();
        tokenize(desc, set);
        return set;
    }

    /**
     * Extracts a set of tokens from an array of RemoteFileDescs - useful if
     * the user wants to mark multiple RFDs from a TableLine as spam (or not),
     * in which case each token should be rated only once.
     *
     * @param descs the array of RemoteFileDescs that should be tokenized
     * @return a non-empty set of Tokens
     */
    public Set<Token> getTokens(RemoteFileDesc[] descs) {
        Set<Token> set = new HashSet<Token>();
        for(RemoteFileDesc desc : descs)
            tokenize(desc, set);
        return set;
    }
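    /*
     * Illustrative usage sketch, not part of the original class: rating a
     * whole TableLine's worth of results in one pass, so that tokens shared
     * by several results are rated once rather than once per result. The
     * RatingTable method shown ("rate") is a hypothetical name used for
     * illustration, not the verified RatingTable API.
     *
     *   Set<Token> tokens = tokenizer.getTokens(selectedDescs);
     *   for(Token token : tokens)
     *       ratingTable.rate(token, userMarkedAsSpam); // hypothetical
     */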
    /**
     * Extracts a set of tokens from a RemoteFileDesc.
     *
     * @param desc the RemoteFileDesc that should be tokenized
     * @param set the set to which the tokens should be added
     */
    private void tokenize(RemoteFileDesc desc, Set<Token> set) {
        if(LOG.isDebugEnabled()) {
            String addr = desc.getAddress().getAddressDescription();
            LOG.debug("Tokenizing result from " + addr);
        }
        String name = desc.getFileName();
        byte[] queryGUID = desc.getQueryGUID();
        if(queryGUID != null) {
            String query = responseVerifier.getQueryString(queryGUID);
            if(query != null) {
                // Detect filenames generated from the query by a template
                Token t = templateHashTokenFactory.create(query, name);
                if(t != null)
                    set.add(t);
            }
        }
        getKeywordTokens(FileUtils.getFilenameNoExtension(name), set);
        String ext = FileUtils.getFileExtension(name);
        if(!ext.equals(""))
            set.add(new FileExtensionToken(ext));
        LimeXMLDocument doc = desc.getXMLDocument();
        if(doc != null) {
            getKeywordTokens(doc, set);
            // A torrent's infohash is a SHA1, so it can be used as a URN
            String infohash = doc.getValue(LimeXMLNames.TORRENT_INFO_HASH);
            if(infohash != null)
                set.add(new UrnToken("urn:sha1:" + infohash));
        }
        URN urn = desc.getSHA1Urn();
        if(urn != null)
            set.add(new UrnToken(urn.toString()));
        set.add(new SizeToken(desc.getSize()));
        set.add(new ApproximateSizeToken(desc.getSize()));
        // Ignore friend addresses and private addresses such as 192.168.x.x
        Address address = desc.getAddress();
        if(address instanceof Connectable) {
            Connectable connectable = (Connectable)address;
            if(!networkInstanceUtils.isPrivateAddress(connectable.getInetAddress()))
                set.add(new AddressToken(connectable.getAddress()));
        }
        set.add(new ClientGUIDToken(Base32.encode(desc.getClientGUID())));
    }

    /**
     * Tokenizes a QueryReply. Keywords from the filenames and XML metadata
     * are ignored, but templates are extracted from the filenames.
     *
     * @param qr the QueryReply that should be tokenized
     * @return a non-empty set of Tokens
     */
    public Set<Token> getNonKeywordTokens(QueryReply qr) {
        if(LOG.isDebugEnabled())
            LOG.debug("Tokenizing query reply from " + qr.getIP());
        Set<Token> set = new HashSet<Token>();
        String query = responseVerifier.getQueryString(qr.getGUID());
        // Client GUID
        set.add(new ClientGUIDToken(Base32.encode(qr.getClientGUID())));
        // Responder's address, unless private
        String ip = qr.getIP();
        if(!networkInstanceUtils.isPrivateAddress(ip))
            set.add(new AddressToken(ip));
        try {
            for(Response r : qr.getResultsArray()) {
                // Template
                if(query != null) {
                    Token t = templateHashTokenFactory.create(query, r.getName());
                    if(t != null)
                        set.add(t);
                }
                // URNs
                for(URN urn : r.getUrns())
                    set.add(new UrnToken(urn.toString()));
                LimeXMLDocument doc = r.getDocument();
                if(doc != null) {
                    String infohash = doc.getValue(LimeXMLNames.TORRENT_INFO_HASH);
                    if(infohash != null)
                        set.add(new UrnToken("urn:sha1:" + infohash));
                }
                // File sizes
                long size = r.getSize();
                set.add(new SizeToken(size));
                set.add(new ApproximateSizeToken(size));
                // Alt-loc addresses, unless private
                for(IpPort ipp : r.getLocations()) {
                    ip = ipp.getInetAddress().getHostAddress();
                    if(!networkInstanceUtils.isPrivateAddress(ip))
                        set.add(new AddressToken(ip));
                }
            }
        } catch(BadPacketException ignored) {}
        return set;
    }
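    /*
     * Illustrative sketch, with assumed names: scoring an incoming reply
     * using only the non-keyword tokens returned above. Keyword tokens are
     * deliberately absent here, per the Javadoc on getNonKeywordTokens(),
     * so the score comes from addresses, GUIDs, URNs, sizes and templates.
     * The ratingTable method and threshold are hypothetical.
     *
     *   Set<Token> tokens = tokenizer.getNonKeywordTokens(queryReply);
     *   float rating = ratingTable.getRating(tokens); // hypothetical method
     *   boolean looksLikeSpam = rating > spamThreshold; // assumed threshold
     */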
    /**
     * Tokenizes a QueryRequest, including the search terms, XML metadata and
     * URNs (if any). We clear the spam ratings of search tokens and ignore
     * them for spam rating purposes for the rest of the session.
     *
     * @param qr the QueryRequest that should be tokenized
     * @return a set of Tokens, may be empty
     */
    public Set<Token> getTokens(QueryRequest qr) {
        if(LOG.isDebugEnabled())
            LOG.debug("Tokenizing " + qr);
        Set<Token> set = new HashSet<Token>();
        getKeywordTokens(qr.getQuery(), set);
        LimeXMLDocument xml = qr.getRichQuery();
        if(xml != null)
            getKeywordTokens(xml, set);
        Set<URN> urns = qr.getQueryUrns();
        for(URN urn : urns)
            set.add(new UrnToken(urn.toString()));
        return set;
    }

    /**
     * Extracts KeywordTokens from an XML metadata document.
     *
     * @param doc the LimeXMLDocument that should be tokenized
     * @param set the set to which the tokens should be added
     */
    private void getKeywordTokens(LimeXMLDocument doc, Set<Token> set) {
        for(Map.Entry<String, String> entry : doc.getNameValueSet()) {
            String name = entry.getKey();
            String value = entry.getValue();
            getXMLKeywords(name, value, set);
        }
    }

    /**
     * Extracts XMLKeywordTokens from the field name and value of an XML
     * metadata item.
     *
     * @param name the field name as a String (e.g. audios__audio__bitrate__)
     * @param value the value as a String
     * @param set the set to which the tokens should be added
     */
    private void getXMLKeywords(String name, String value, Set<Token> set) {
        name = extractSimpleFieldName(name);
        // String.toLowerCase() returns a new String; assign the result,
        // otherwise the case conversion is silently discarded
        name = name.toLowerCase(Locale.US);
        value = value.toLowerCase(Locale.US);
        for(String keyword : QueryUtils.extractKeywords(value, false)) {
            if(keyword.length() > MAX_KEYWORD_LENGTH)
                keyword = keyword.substring(0, MAX_KEYWORD_LENGTH);
            set.add(new XMLKeywordToken(name, keyword));
        }
    }

    /**
     * Extracts the last part of the field name from a canonical field name
     * (e.g. audios__audio__bitrate__ becomes bitrate).
     *
     * @param name the canonical field name
     * @return the last part of the canonical field name
     */
    private String extractSimpleFieldName(String name) {
        int idx1 = name.lastIndexOf(XMLStringUtils.DELIMITER);
        int idx2 = name.lastIndexOf(XMLStringUtils.DELIMITER, idx1 - 1);
        return name.substring(idx2 + XMLStringUtils.DELIMITER.length(), idx1);
    }

    /**
     * Splits a String into keyword tokens using QueryUtils.extractKeywords().
     *
     * @param str the String to tokenize
     * @param set the set to which the tokens should be added
     */
    private void getKeywordTokens(String str, Set<Token> set) {
        // As above, assign the lowercased copy rather than discarding it
        str = str.toLowerCase(Locale.US);
        for(String keyword : QueryUtils.extractKeywords(str, false)) {
            if(keyword.length() > MAX_KEYWORD_LENGTH)
                keyword = keyword.substring(0, MAX_KEYWORD_LENGTH);
            set.add(new KeywordToken(keyword));
        }
    }
}
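/*
 * Worked example, illustrative only: how one XML metadata field flows
 * through getXMLKeywords(), assuming XMLStringUtils.DELIMITER is "__".
 *
 *   getXMLKeywords("audios__audio__bitrate__", "192", set)
 *     -> extractSimpleFieldName(...) returns "bitrate"
 *     -> set gains XMLKeywordToken("bitrate", "192")
 *
 * Keywords longer than MAX_KEYWORD_LENGTH (8) chars are truncated, so a
 * value keyword such as "soundtrack" would be stored as "soundtra".
 */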