package com.limegroup.gnutella.spam;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.limewire.core.settings.FilterSettings;
import org.limewire.inject.EagerSingleton;
import org.limewire.io.IOUtils;
import org.limewire.lifecycle.Service;
import org.limewire.lifecycle.ServiceRegistry;
import org.limewire.util.Base32;
import org.limewire.util.CommonUtils;
import org.limewire.util.GenericsUtils;

import com.google.inject.Inject;
import com.google.inject.name.Named;
import com.limegroup.gnutella.RemoteFileDesc;
import com.limegroup.gnutella.messages.QueryReply;
import com.limegroup.gnutella.messages.QueryRequest;

@EagerSingleton
public class RatingTable implements Service {

    private static final Log LOG = LogFactory.getLog(RatingTable.class);

    /**
     * Don't hold more than this many entries in memory or save more than
     * this many entries to disk. The size is a tradeoff - tokens should be
     * discarded when they become irrelevant, but not before.
     */
    private static final int MAX_SIZE = 5000;

    /**
     * Initial size of the rating table - should not be too large, as many
     * users probably don't use the spam filter.
     */
    private static final int INITIAL_SIZE = 100;

    /**
     * The file for storing tokens between sessions.
     */
    private final File spamDat;

    /**
     * A map containing a limited number of tokens. We use a map rather than
     * a set so that we can retrieve a stored token by using an equivalent
     * token as a key. This allows us to use a token without rating data to
     * retrieve an equivalent token that has rating data.
     * <p>
     * The size of the map is limited. Entries are discarded in
     * least-recently-used order when the map is full, on the assumption that
     * the least-recently-used token is the least important to keep. Tokens
     * with zero ratings are not stored in the map.
     * <p>
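     * For illustration, a sketch added to this comment (it relies on standard
     * {@link LinkedHashMap} access-order semantics rather than on anything
     * specific to this class): {@code get()} counts as an access, so merely
     * looking a token up moves it to the back of the eviction queue.
     * <pre>{@code
     * tokenMap.put(a, a); // a is now the most recently used entry
     * tokenMap.put(b, b); // b is now the most recently used entry
     * tokenMap.get(a);    // a is the most recently used entry again
     * // If the map exceeded MAX_SIZE, b would be the next entry discarded.
     * }</pre>
     * <p>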
     * LOCKING: this.
     */
    private final Map<Token, Token> tokenMap =
            new LinkedHashMap<Token, Token>(INITIAL_SIZE, 0.75f, true) {
        // Called by put() and putAll() after inserting a new entry; get()
        // only affects the access order
        @Override
        protected boolean removeEldestEntry(Map.Entry<Token, Token> e) {
            if(size() > MAX_SIZE) {
                if(LOG.isDebugEnabled())
                    LOG.debug("Discarding token " + e.getValue());
                return true;
            }
            return false;
        }
    };

    /**
     * Tokens that the user has searched for during this session (could be
     * keywords, XML metadata, and maybe URNs in the future). They will not
     * contribute to the spam ratings of search results, because spammers
     * often echo the search terms.
     */
    private final HashSet<Token> searchTokens = new HashSet<Token>();

    /**
     * Whether the rating table needs to be saved. LOCKING: this.
     */
    private boolean dirty = false;

    private final Tokenizer tokenizer;
    private final TemplateHashTokenFactory templateHashTokenFactory;
    private final ScheduledExecutorService backgroundExecutor;

    @Inject
    RatingTable(Tokenizer tokenizer,
            TemplateHashTokenFactory templateHashTokenFactory,
            @Named("backgroundExecutor") ScheduledExecutorService backgroundExecutor) {
        this.tokenizer = tokenizer;
        this.templateHashTokenFactory = templateHashTokenFactory;
        this.backgroundExecutor = backgroundExecutor;
        spamDat = new File(CommonUtils.getUserSettingsDir(), "spam.dat");
    }

    @Inject
    void register(ServiceRegistry registry) {
        registry.register(this);
    }

    public String getServiceName() {
        return org.limewire.i18n.I18nMarker.marktr("Spam Management");
    }

    public void initialize() {
    }

    public synchronized void start() {
        load(spamDat);
        loadSpamTokensFromSettings();
        // Save the ratings every five minutes (if necessary)
        backgroundExecutor.scheduleWithFixedDelay(new Runnable() {
            @Override
            public void run() {
                save();
            }
        }, 5, 5, TimeUnit.MINUTES);
    }

    public synchronized void stop() {
        save();
    }

    synchronized void loadSpamTokensFromSettings() {
        if(!FilterSettings.USE_NETWORK_FILTER.getValue())
            return;
        // Rate the received template hashes as spam
        for(String hash : FilterSettings.SPAM_TEMPLATES.get()) {
            setRatingIfUnrated(new TemplateHashToken(Base32.decode(hash)), 1f);
        }
        // Rate the received file sizes as spam
        for(String size : FilterSettings.SPAM_SIZES.get()) {
            try {
                setRatingIfUnrated(new ApproximateSizeToken(Long.parseLong(size)), 1f);
            } catch(NumberFormatException e) {
                LOG.debug("Error parsing file size", e);
            }
        }
    }

    /**
     * Clears the filter data.
     */
    synchronized void clear() {
        LOG.debug("Clearing ratings");
        tokenMap.clear();
        dirty = true;
    }

    /**
     * Returns the rating for a RemoteFileDesc.
     *
     * @param desc the RemoteFileDesc to rate
     * @return the rating for the RemoteFileDesc
     */
    synchronized float getRating(RemoteFileDesc desc) {
        float rating = getRating(lookup(tokenizer.getTokens(desc)));
        if(LOG.isDebugEnabled()) {
            String addr = desc.getAddress().getAddressDescription();
            LOG.debug("Result from " + addr + " rated " + rating);
        }
        return rating;
    }

    /**
     * Returns the combined rating for a set of tokens.
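     * <p>
     * For illustration (a note added here, not original documentation):
     * token ratings are combined like independent probabilities, so the
     * result is {@code 1 - (1 - r1) * (1 - r2) * ... * (1 - rn)}. Two tokens
     * rated 0.5 each combine to 0.75, and any single token rated 1 makes the
     * combined rating 1.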
     *
     * @param tokens a set of tokens to be rated
     * @return the combined rating for the tokens
     */
    private float getRating(Set<Token> tokens) {
        float rating = 1;
        for(Token t : tokens)
            rating *= 1 - t.getRating();
        return 1 - rating;
    }

    /**
     * Assigns the given rating to an array of RemoteFileDescs.
     *
     * @param descs an array of RemoteFileDescs to be rated
     * @param rating a rating between 0 (not spam) and 1 (spam)
     */
    synchronized void rate(RemoteFileDesc[] descs, float rating) {
        rateInternal(lookup(tokenizer.getTokens(descs)), rating);
    }

    /**
     * Assigns the given rating to a QueryReply, ignoring keyword tokens.
     *
     * @param qr a QueryReply to be rated
     * @param rating a rating between 0 (not spam) and 1 (spam)
     */
    synchronized void rate(QueryReply qr, float rating) {
        rateInternal(lookup(tokenizer.getNonKeywordTokens(qr)), rating);
    }

    /**
     * Assigns the given rating to the given token and stores it, unless the
     * token is already stored, in which case the existing rating is preserved.
     * LOCKING: this.
     */
    private void setRatingIfUnrated(Token t, float rating) {
        if(rating == 0f)
            return;
        Token stored = tokenMap.get(t);
        if(stored == null) {
            if(LOG.isDebugEnabled())
                LOG.debug("Setting rating of " + t + " to " + rating);
            t.setRating(rating);
            tokenMap.put(t, t);
            dirty = true;
        } else {
            if(LOG.isDebugEnabled())
                LOG.debug("Not replacing rating of " + t);
        }
    }

    /**
     * Clears the ratings of the tokens associated with a QueryRequest and
     * ignores them for the rest of the session.
     *
     * @param qr the QueryRequest to clear
     */
    synchronized void clear(QueryRequest qr) {
        for(Token t : tokenizer.getTokens(qr)) {
            if(LOG.isDebugEnabled())
                LOG.debug("Clearing search token " + t);
            searchTokens.add(t); // Ignore the token for this session
            // Clear the rating for future sessions
            if(tokenMap.remove(t) != null)
                dirty = true;
        }
    }

    /**
     * Assigns the given rating to a set of tokens, storing any that have
     * non-zero ratings after being updated and removing from the map any that
     * have zero ratings after being updated. LOCKING: this.
     *
     * @param tokens a set of tokens to be rated
     * @param rating a rating between 0 (not spam) and 1 (spam)
     */
    private void rateInternal(Set<Token> tokens, float rating) {
        for(Token t : tokens) {
            float before = t.getRating();
            t.updateRating(rating);
            float after = t.getRating();
            if(LOG.isDebugEnabled())
                LOG.debug(t + " was rated " + before + ", now rated " + after);
            if(after == 0f)
                tokenMap.remove(t);
            else
                tokenMap.put(t, t);
            dirty = true;
        }
    }

    /**
     * Replaces each token with an equivalent previously stored token, or
     * keeps the token that was passed in if no equivalent exists. Tokens
     * that have been searched for during this session are not returned.
     *
     * @param tokens a set of tokens to be replaced
     * @return a set of equivalent tokens, with search tokens removed
     */
    private Set<Token> lookup(Set<Token> tokens) {
        Set<Token> newTokens = new HashSet<Token>();
        for(Token t : tokens) {
            if(!searchTokens.contains(t))
                newTokens.add(lookup(t));
            else if(LOG.isDebugEnabled())
                LOG.debug("Ignoring search token " + t);
        }
        return newTokens;
    }

    /**
     * Returns an equivalent previously stored token if any such token exists,
     * otherwise returns the token that was passed in. LOCKING: this.
     *
     * @param token the token to look up
     * @return the same token or a previously stored equivalent
     */
    private Token lookup(Token token) {
        Token stored = tokenMap.get(token);
        return stored == null ? token : stored;
    }

    /**
     * Looks up a single token and returns its rating (for testing).
     */
    synchronized float lookupAndGetRating(Token token) {
        return getRating(Collections.singleton(lookup(token)));
    }

    /**
     * Loads ratings from disk. Package access for testing.
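     * <p>
     * (Descriptive note added here, not original documentation.) The file is
     * expected to contain a Java-serialized {@code List<Token>}, as written
     * by {@link #save()}. Tokens with a zero rating are skipped and old
     * {@code TemplateToken}s are converted to {@code TemplateHashToken}s, so
     * loading can leave the table marked dirty even though no new ratings
     * were assigned.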
     */
    void load(File file) {
        if(!file.exists()) {
            LOG.debug("No ratings to load");
            return;
        }
        Map<Token, Token> temporaryMap = new HashMap<Token, Token>();
        ObjectInputStream is = null;
        try {
            is = new ObjectInputStream(
                    new BufferedInputStream(
                            new FileInputStream(file)));
            List<Token> list = GenericsUtils.scanForList(is.readObject(),
                    Token.class, GenericsUtils.ScanMode.REMOVE);
            int zeroes = 0, converted = 0;
            for(Token t : list) {
                // Convert old template tokens into template hash tokens
                if(t instanceof TemplateToken) {
                    t = templateHashTokenFactory.convert((TemplateToken) t);
                    converted++;
                }
                if(t.getRating() > 0f) {
                    if(LOG.isDebugEnabled())
                        LOG.debug("Loading " + t + ", rated " + t.getRating());
                    temporaryMap.put(t, t);
                } else {
                    zeroes++;
                }
            }
            synchronized(this) {
                tokenMap.clear();
                tokenMap.putAll(temporaryMap);
                if(zeroes > 0 || converted > 0)
                    dirty = true;
                if(LOG.isDebugEnabled()) {
                    LOG.debug("Loaded " + tokenMap.size() +
                            " entries, converted " + converted +
                            ", skipped " + zeroes + " with zero scores");
                }
            }
        } catch(IOException e) {
            LOG.debug("Error loading spam ratings: ", e);
        } catch(ClassNotFoundException e) {
            LOG.debug("Error loading spam ratings: ", e);
        } catch(ClassCastException e) {
            LOG.debug("Error loading spam ratings: ", e);
        } finally {
            IOUtils.close(is);
        }
    }

    /**
     * Saves ratings to disk.
     */
    void save() {
        ArrayList<Token> list;
        synchronized(this) {
            if(!dirty) {
                LOG.debug("Ratings do not need to be saved");
                return;
            }
            dirty = false;
            list = new ArrayList<Token>(tokenMap.size());
            // The iterator returns the least-recently-used entry first
            for(Map.Entry<Token, Token> e : tokenMap.entrySet()) {
                Token t = e.getKey();
                if(LOG.isDebugEnabled())
                    LOG.debug("Saving " + t + ", rated " + t.getRating());
                list.add(t);
            }
        }
        ObjectOutputStream oos = null;
        try {
            oos = new ObjectOutputStream(
                    new BufferedOutputStream(
                            new FileOutputStream(spamDat)));
            oos.writeObject(list);
            oos.flush();
            if(LOG.isDebugEnabled())
                LOG.debug("Saved " + list.size() + " entries");
        } catch(IOException iox) {
            LOG.debug("Error saving spam ratings: ", iox);
        } finally {
            IOUtils.close(oos);
        }
    }

    /**
     * @return the number of tokens in the rating table (for testing)
     */
    synchronized int size() {
        return tokenMap.size();
    }

    /**
     * @return the least-recently-used token in the table (for testing)
     */
    synchronized Token getLeastRecentlyUsed() {
        for(Map.Entry<Token, Token> e : tokenMap.entrySet())
            return e.getKey();
        return null; // Empty
    }
}