package com.limegroup.gnutella.spam; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.ByteArrayOutputStream; import java.io.DataOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.limewire.core.settings.SearchSettings; import org.limewire.inject.EagerSingleton; import org.limewire.inspection.DataCategory; import org.limewire.inspection.Inspectable; import org.limewire.inspection.InspectionPoint; import org.limewire.io.IOUtils; import org.limewire.lifecycle.Service; import org.limewire.util.CommonUtils; import org.limewire.util.GenericsUtils; import com.google.inject.Inject; import com.limegroup.gnutella.RemoteFileDesc; import com.limegroup.gnutella.messages.QueryReply; import com.limegroup.gnutella.messages.QueryRequest; @EagerSingleton public class RatingTable implements Service { private static final Log LOG = LogFactory.getLog(RatingTable.class); /** * Don't hold more than this many entries in memory or save more than * this many entries to disk. The size is a tradeoff - tokens should be * discarded when they become irrelevant, but not before. */ private static final int MAX_SIZE = 5000; /** * Initial size of the rating table - should not be too large as many * users probably don't use the spam filter. */ private static final int INITIAL_SIZE = 100; /** * A map containing a limited number of tokens. We use a map rather than * a set so that we can retrieve a stored token by using an equivalent * token as a key. This allows us to use a token without rating data to * retrieve an equivalent token that has rating data. * <p> * The size of the map is limited. Entries are discarded in * least-recently-used order when the map is full, on the assumption that * the least-recently-used token is the least important to keep. Tokens * with zero ratings are not stored in the map. */ private final Map<Token, Token> tokenMap = new LinkedHashMap<Token, Token>(INITIAL_SIZE, 0.75f, true) { // This method will be called on every get(), put(), and putAll() @Override protected boolean removeEldestEntry(Map.Entry<Token, Token> e) { if(size() > MAX_SIZE) { if(LOG.isDebugEnabled()) LOG.debug("Discarding token " + e.getValue()); return true; } return false; } }; /** * Tokens that the user has searched for during this session (could be * keywords, XML metadata, and maybe URNs in the future). They will not * contribute to the spam ratings of search results, because spammers * often echo the search terms. */ private final HashSet<Token> searchTokens = new HashSet<Token>(); private final Tokenizer tokenizer; @Inject RatingTable(Tokenizer tokenizer) { this.tokenizer = tokenizer; } @Inject void register(org.limewire.lifecycle.ServiceRegistry registry) { registry.register(this); } public String getServiceName() { return org.limewire.i18n.I18nMarker.marktr("Spam Management"); } public void initialize() { } public synchronized void start() { load(); } public synchronized void stop() { save(); } /** * Clears the filter data */ protected synchronized void clear() { LOG.debug("Clearing ratings"); tokenMap.clear(); } /** * Returns the rating for a RemoteFileDesc * * @param desc the RemoteFileDesc to rate * @return the rating for the RemoteFileDesc */ protected synchronized float getRating(RemoteFileDesc desc) { float rating = getRating(lookup(tokenizer.getTokens(desc))); if(LOG.isDebugEnabled()) { String addr = desc.getAddress().getAddressDescription(); LOG.debug("Result from " + addr + " rated " + rating); } return rating; } /** * Returns the combined rating for a set of tokens. * * @param tokens a set of tokens to be rated * @return the combined rating for the tokens */ private float getRating(Set<Token> tokens) { float rating = 1; for(Token t : tokens) rating *= 1 - t.getRating(); return 1 - rating; } /** * Assigns the given rating to an array of RemoteFileDescs. * * @param descs an array of RemoteFileDescs to be rated * @param rating a rating between 0 (not spam) and 1 (spam) */ protected synchronized void rate(RemoteFileDesc[] descs, float rating) { rateInternal(lookup(tokenizer.getTokens(descs)), rating); } /** * Assigns the given rating to a QueryReply, ignoring keyword tokens. * * @param qr a QueryReply to be rated * @param rating a rating between 0 (not spam) and 1 (spam) */ protected synchronized void rate(QueryReply qr, float rating) { rateInternal(lookup(tokenizer.getNonKeywordTokens(qr)), rating); } /** * Clears the ratings of the tokens associated with a QueryRequest and * ignores them for the rest of the session. * * @param qr the QueryRequest to clear */ protected synchronized void clear(QueryRequest qr) { for(Token t : tokenizer.getTokens(qr)) { if(LOG.isDebugEnabled()) LOG.debug("Clearing search token " + t); searchTokens.add(t); // Ignore the token for this session tokenMap.remove(t); // Clear the rating for future sessions } } /** * Assigns the given rating to a set of tokens, storing any that have * non-zero ratings after being updated and removing from the map any that * have zero ratings after being updated. * * @param tokens a set of tokens to be rated * @param rating a rating between 0 (not spam) and 1 (spam) */ private void rateInternal(Set<Token> tokens, float rating) { for(Token t : tokens) { float before = t.getRating(); t.updateRating(rating); float after = t.getRating(); if(LOG.isDebugEnabled()) LOG.debug(t + " was rated " + before + ", now rated " + after); if(after == 0f) tokenMap.remove(t); else tokenMap.put(t, t); } } /** * Replaces each token with an equivalent previously stored token, or * returns the token that was passed in if no equivalent exists. Tokens * that have been searched for during this session are not returned. * * @param tokens a set of tokens to be replaced * @return a set of equivalent tokens, with search tokens removed */ private Set<Token> lookup(Set<Token> tokens) { Set<Token> newTokens = new HashSet<Token>(); for(Token t : tokens) { if(!searchTokens.contains(t)) newTokens.add(lookup(t)); else if(LOG.isDebugEnabled()) LOG.debug("Ignoring search token " + t); } return newTokens; } /** * Returns an equivalent previously stored token if any such token exists, * otherwise returns the token that was passed in. * * @param token the token to look up * @return the same token or a previously stored equivalent */ private Token lookup(Token token) { Token stored = tokenMap.get(token); return stored == null ? token : stored; } /** * Loads ratings from disk. */ private void load() { tokenMap.clear(); ObjectInputStream is = null; try { is = new ObjectInputStream( new BufferedInputStream( new FileInputStream(getSpamDat()))); List<Token> list = GenericsUtils.scanForList(is.readObject(), Token.class, GenericsUtils.ScanMode.REMOVE); int zeroes = 0; for(Token t : list) { if(t.getRating() > 0f) { if(LOG.isDebugEnabled()) LOG.debug("Loading " + t + ", rated " + t.getRating()); tokenMap.put(t, t); } else { zeroes++; } } if(LOG.isDebugEnabled()) { LOG.debug("Loaded " + tokenMap.size() + " entries, skipped " + zeroes + " with zero scores"); } } catch(Throwable t) { LOG.debug("Error loading spam ratings: ", t); } finally { IOUtils.close(is); } } /** * Saves ratings to disk (called whenever the user marks a search result). */ public void save() { ArrayList<Token> list; synchronized(this) { list = new ArrayList<Token>(tokenMap.size()); // The iterator returns the least-recently-used entry first for(Map.Entry<Token,Token> e : tokenMap.entrySet()) { Token t = e.getKey(); if(LOG.isDebugEnabled()) LOG.debug("Saving " + t + ", rated " + t.getRating()); list.add(t); } } ObjectOutputStream oos = null; try { oos = new ObjectOutputStream( new BufferedOutputStream( new FileOutputStream(getSpamDat()))); oos.writeObject(list); oos.flush(); if(LOG.isDebugEnabled()) LOG.debug("Saved " + list.size() + " entries"); } catch (IOException iox) { LOG.debug("Error saving spam ratings: ", iox); } finally { IOUtils.close(oos); } } /** * @return the number of tokens in the rating table (for testing) */ protected int size() { return tokenMap.size(); } /** * @return the least-recently-used token in the table (for testing) */ protected Token getLeastRecentlyUsed() { for(Map.Entry<Token,Token> e : tokenMap.entrySet()) return e.getKey(); return null; // Empty } private static File getSpamDat() { return new File(CommonUtils.getUserSettingsDir(), "spam.dat"); } /** Inspectable that returns a hash and rating of the tokens */ @InspectionPoint(value = "spam rating table token hashes", category = DataCategory.USAGE) @SuppressWarnings("unused") private final Inspectable TOKEN_HASH = new Inspectable() { @Override public Object inspect() { synchronized(RatingTable.this) { Map<String, Object> m = new HashMap<String, Object>(); m.put("ver",1); final float spamThreshold = SearchSettings.FILTER_SPAM_RESULTS.getValue(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream daos = new DataOutputStream(baos); try { for (Token t : tokenMap.values()) { // 8 bytes per entry float rating = t.getRating(); if (rating < spamThreshold) break; daos.writeFloat(rating); daos.writeInt(t.hashCode()); } daos.flush(); daos.close(); m.put("dump", baos.toByteArray()); } catch (IOException impossible) { m.put("error", impossible.toString()); } return m; } } }; }