package guang.crawler.siteManager.urlFilter;

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.BitSet;

import org.apache.commons.codec.binary.Base64;

/**
 * Filters objects (typically URLs) using a bitmap indexed by a hash of the
 * object's string form. Because distinct objects can hash to the same bit,
 * {@link #contains(Object)} and {@link #containsAndSet(Object)} may report
 * false positives.
 *
 * <p>
 * All access to the bitmap is guarded by a private lock, and the shared
 * {@link MessageDigest} is only used while holding its own monitor, so a single
 * instance may be used from several threads.
 *
 * @author yang
 *
 */
public class BitMapFilter implements ObjectFilter {

    /**
     * Number of bits in the bitmap: 16M bits, i.e. 2 MB of memory. This matches
     * the 24-bit index produced by {@link #hashObject(Object)}.
     */
    private static final int FILTER_SIZE = 16 * 1024 * 1024;

    /**
     * Name of the digest algorithm used to hash objects.
     */
    private static final String DIGESTALG = "MD5";

    /**
     * Guards {@link #filterData}. A dedicated lock is required because
     * {@link #fromBackupString(String)} replaces the bitmap instance, so the
     * bitmap itself cannot serve as a stable monitor.
     */
    private final Object lock = new Object();

    /**
     * The bitmap; one bit per hash bucket. Guarded by {@link #lock}.
     */
    private BitSet filterData;

    /**
     * Digest used to produce hash values. {@link MessageDigest} is stateful, so
     * it is only ever used while synchronized on the digest instance.
     */
    private final MessageDigest digest;

    /**
     * Creates a new, empty filter.
     *
     * @return a filter with no bits set
     * @throws NoSuchAlgorithmException
     *             if the digest algorithm is not available on this JVM
     */
    public static BitMapFilter newFilter() throws NoSuchAlgorithmException {
        return new BitMapFilter();
    }

    private BitMapFilter() throws NoSuchAlgorithmException {
        this.filterData = new BitSet(FILTER_SIZE);
        this.digest = MessageDigest.getInstance(DIGESTALG);
    }

    /**
     * Tests whether the bit for the given object is set.
     *
     * @param object
     *            the object to look up; its {@code toString()} form is hashed
     * @return {@code true} if the object's bit is set (possibly a false
     *         positive); {@code false} if it is not set or {@code object} is
     *         {@code null}
     */
    @Override
    public boolean contains(final Object object) {
        if (object == null) {
            return false;
        }
        int bitIdx = this.hashObject(object);
        synchronized (this.lock) {
            return this.filterData.get(bitIdx);
        }
    }

    /**
     * Atomically tests whether the bit for the given object is set and, if it
     * is not, sets it.
     *
     * @param object
     *            the object to look up; its {@code toString()} form is hashed
     * @return {@code true} if the bit was already set (possibly a false
     *         positive); {@code false} if it was not set before this call or
     *         {@code object} is {@code null} (in which case nothing is
     *         recorded)
     */
    @Override
    public boolean containsAndSet(final Object object) {
        if (object == null) {
            return false;
        }
        int bitIdx = this.hashObject(object);
        synchronized (this.lock) {
            if (this.filterData.get(bitIdx)) {
                return true;
            }
            this.filterData.set(bitIdx);
            return false;
        }
    }

    /**
     * Replaces the current bitmap with one decoded from a Base64 string as
     * produced by {@link #toBackupString()}.
     *
     * @param dataString
     *            Base64 encoding of the bitmap bytes
     */
    @Override
    public void fromBackupString(final String dataString) {
        // Decode outside the lock; only the swap itself needs to be guarded.
        byte[] data = Base64.decodeBase64(dataString);
        BitSet restored = BitSet.valueOf(data);
        synchronized (this.lock) {
            this.filterData = restored;
        }
    }

    /**
     * Maps an object to a bit index in the range {@code [0, 2^24)} by hashing
     * its string form and combining three bytes of the digest.
     *
     * @param object
     *            the non-null object to hash
     * @return the 24-bit bitmap index for the object
     */
    private int hashObject(final Object object) {
        // Use an explicit charset so the index for a given string does not
        // depend on the platform default encoding.
        byte[] input = object.toString().getBytes(StandardCharsets.UTF_8);
        byte[] digestData;
        // MessageDigest instances are not safe for concurrent use.
        synchronized (this.digest) {
            digestData = this.digest.digest(input);
        }
        int bitIdx = 0;
        bitIdx |= (digestData[7] & 0xff);
        bitIdx <<= 8;
        bitIdx |= (digestData[8] & 0xff);
        bitIdx <<= 8;
        bitIdx |= (digestData[9] & 0xff);
        return bitIdx;
    }

    /**
     * Serializes the current bitmap to a Base64 string that can later be
     * passed to {@link #fromBackupString(String)}.
     *
     * @return Base64 encoding of the bitmap bytes
     */
    @Override
    public String toBackupString() {
        byte[] data;
        // Snapshot under the lock so concurrent updates cannot tear the copy.
        synchronized (this.lock) {
            data = this.filterData.toByteArray();
        }
        return Base64.encodeBase64String(data);
    }
}