package com.limegroup.gnutella.filters;
import java.util.HashSet;
import java.util.Set;
import com.limegroup.gnutella.GUID;
import com.limegroup.gnutella.messages.Message;
import com.limegroup.gnutella.messages.PingRequest;
import com.limegroup.gnutella.messages.QueryRequest;
import com.limegroup.gnutella.util.Buffer;
import com.limegroup.gnutella.xml.LimeXMLDocument;
/**
* A spam filter that tries to eliminate duplicate packets from
* overzealous users. Since requests are not traceable, we
* have to use the following heuristics:
*
* <ul>
* <li>Two pings or queries are considered duplicates if they have similar
* GUID's, arrived within M messages of each other, and arrived not
* more than T seconds apart.
* <li>Two queries are considered duplicates if they have
* the same query string, arrived within ~N seconds of each other,
* and have the same hops counts.
* </ul>
*
* It would also be possible to special-case hops counts of zero.
*/
public class DuplicateFilter extends SpamFilter {
/**
* The number of old pings to keep in memory. If this is too small, we
* won't be filtering properly. If this is too large, lookup becomes
* expensive. Assuming 10 messages arrive per second, this allows for 1
* second worth of history.
*
* INVARIANT: BUF_SIZE>1
*/
private static final int BUF_SIZE=20;
/** a list of the GUIDs of the last pings we saw and
* their timestamps.
*
* INVARIANT: the youngest entries have largest timestamps
*/
private Buffer /* of GUIDPair */ guids=new Buffer(BUF_SIZE);
/** The time, in milliseconds, allowed between similar messages. */
private static final int GUID_LAG=500;
/**
* When comparing two messages, if the GUIDs of the two messages differ
* in more than TOLERANCE bytes, the second message will be allowed.
* if they differ in less than or equal to TOLERANCE bytes the second
* message will not be allowed thro'
*/
private static final int TOLERANCE=2;
/**
* To efficiently look up queries, we maintain a hash set of query/hops
* pairs. (A balanced tree didn't work as well.) The only problem is that
* we must expire entries from this set that are more than a few seconds
* old. We approximate this FIFO behavior by maintaining two sets of
* queries and swapping them around.<p>
*
* For the moment assume a constant stream of queries. Every Q=QUERY_LAG
* milliseconds, a query triggers the "promotion" of newQueries" to
* oldQueries. Hence youngQueries consists of queries that are up to Q
* seconds old, and oldQueries consists of queries that are up to 2*Q
* seconds old. At the time of the promotion, entries in youngQueries have
* an average age of Q/2. So the time-averaged filter window time N
* described above is (Q/2+(Q+Q/2))/2=Q. But for any given query, N may be
* as large as 2*Q and as small as Q.
*
* Things get more complicated if we don't have a steady stream of queries.
* One error would be two simply promote youngQueries when receiving the
* first query after the last promotion. This would mean, for example, that
* very slow queries exclusively for "X" would always be blocked. Hence if
* more than 2*Q seconds has elapsed since the last promotion, we simply
* clear both sets. This means that the maximum window size N can actually
* be as high as 3*Q if there is little traffic.
*/
private static final int QUERY_LAG=1500;
/** The system time when we will promote youngQueries. */
private long querySwapTime=0;
/** The system time when we will clear both sets.
* INVARIANT: queryClearTime=querySwapTime+QUERY_LAG. */
private long queryClearTime=QUERY_LAG;
/** INVARIANT: youngQueries and oldQueries are disjoint. */
private Set /* of QueryPair */ youngQueries=new HashSet();
private Set /* of QueryPair */ oldQueries=new HashSet();
/** Returns the approximate system time in milliseconds. */
private static long getTime() {
//TODO3: avoid a system call by looking at the backend heartbeat timer.
return System.currentTimeMillis();
}
////////////////////////////////////////////////////////////////////////////
public boolean allow(Message m) {
//m is allowed if
//1. it passes the GUID test and
//2. it passes the query test if it is a query request
if (! allowGUID(m))
return false;
else if (m instanceof QueryRequest)
return allowQuery((QueryRequest)m);
else
return true;
}
public boolean allowGUID(Message m) {
//Do NOT apply this filter to pongs, query replies, or pushes,
//since many of those will (legally) have the same GUID.
if (! ((m instanceof QueryRequest) || (m instanceof PingRequest)))
return true;
GUIDPair me=new GUIDPair(m.getGUID(), getTime(), m.getHops());
//Consider all messages that came in within GUID_LAG milliseconds
//of this...
int z = guids.getSize();
for(int j=0; j<z ; j++){
GUIDPair other=(GUIDPair)guids.get(j);
//The following assertion fails for mysterious reasons on the
//Macintosh. Also, it can fail if the user adjusts the clock, e.g.,
//for daylight savings time. Luckily it need not hold for the code
//to work correctly.
// Assert.that(me.time>=other.time,"Unexpected clock behavior");
if ((me.time-other.time) > GUID_LAG)
//All remaining pings have smaller timestamps.
break;
//If different hops, keep looking
if (other.hops != me.hops)
continue;
//Are the GUIDs similar?. TODO3: can optimize
int misses=0;
for (int i=0; i<me.guid.length&&misses<=TOLERANCE; i++) {
if (me.guid[i]!=other.guid[i])
misses++;
}
if (misses<=TOLERANCE) {//really close GUIDS
guids.add(me);
return false;
}
}
guids.add(me);
return true;
}
public boolean allowQuery(QueryRequest qr) {
//Update sets as needed.
long time=getTime();
if (time > querySwapTime) {
if (time <= queryClearTime) {
//A little time has passed. Promote youngQueries.
Set tmp=oldQueries;
oldQueries=youngQueries;
youngQueries=tmp;
youngQueries.clear();
} else {
//A lot of time has passed. Clear both.
youngQueries.clear();
oldQueries.clear();
}
querySwapTime=time+QUERY_LAG;
queryClearTime=querySwapTime+QUERY_LAG;
}
//Look up query in both sets. Add it to new set if not already there.
QueryPair qp=new QueryPair(qr.getQuery(),
qr.getHops(),
qr.getRichQuery(),
qr.getQueryUrns(),
qr.getMetaMask() );
if (oldQueries.contains(qp)) {
return false;
} else {
boolean added=youngQueries.add(qp);
return added; //allow if wasn't already in young set
}
}
}
final class GUIDPair {
byte[] guid;
long time;
int hops;
GUIDPair(byte[] guid, long time, int hops) {
this.guid=guid;
this.time=time;
this.hops=hops;
}
public String toString() {
return "["+(new GUID(guid)).toString()+", "+time+"]";
}
}
final class QueryPair {
String query;
int hops;
LimeXMLDocument xml;
Set URNs;
int cachedHash = 0;
int metaMask;
QueryPair(String query, int hops, LimeXMLDocument xml,
Set URNs, int metaMask) {
this.query=query;
this.hops=hops;
this.xml = xml;
this.URNs = URNs;
this.metaMask = metaMask;
}
/*
public int compareTo(Object o) {
QueryPair other=(QueryPair)o;
//Primary key: hops
//Secondary key: query
//(This may make the tree less balanced, but it results in fewer string
//comparisons.)
int ret=this.hops-other.hops;
if (ret==0)
return this.query.compareTo(other.query);
else
return ret;
} */
public boolean equals(Object o) {
if ( o == this ) return true;
if (!(o instanceof QueryPair))
return false;
QueryPair other=(QueryPair)o;
return this.hops==other.hops &&
this.metaMask == other.metaMask &&
this.URNs.equals(other.URNs) &&
this.query.equals(other.query) &&
(xml == null ? other.xml == null : xml.equals(other.xml));
}
public int hashCode() {
if ( cachedHash == 0 ) {
cachedHash = 17;
cachedHash = (37*cachedHash) + query.hashCode();
if( xml != null )
cachedHash = (37*cachedHash) + xml.hashCode();
cachedHash = (37*cachedHash) + URNs.hashCode();
cachedHash = (37*cachedHash) + hops;
cachedHash = (37*cachedHash) + metaMask;
}
return cachedHash;
}
public String toString() {
return "[\""+query+"\", "+hops+"]";
}
}