package us.codecraft.webmagic.scheduler; /** * @author code4crafter@gmail.com * Date: 16/12/18 * Time: 上午10:23 */ import com.google.common.hash.BloomFilter; import com.google.common.hash.Funnels; import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.scheduler.component.DuplicateRemover; import java.nio.charset.Charset; import java.util.concurrent.atomic.AtomicInteger; /** * BloomFilterDuplicateRemover for huge number of urls. * * @author code4crafer@gmail.com * @since 0.5.1 */ public class BloomFilterDuplicateRemover implements DuplicateRemover { private int expectedInsertions; private double fpp; private AtomicInteger counter; public BloomFilterDuplicateRemover(int expectedInsertions) { this(expectedInsertions, 0.01); } /** * * @param expectedInsertions the number of expected insertions to the constructed * @param fpp the desired false positive probability (must be positive and less than 1.0) */ public BloomFilterDuplicateRemover(int expectedInsertions, double fpp) { this.expectedInsertions = expectedInsertions; this.fpp = fpp; this.bloomFilter = rebuildBloomFilter(); } protected BloomFilter<CharSequence> rebuildBloomFilter() { counter = new AtomicInteger(0); return BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), expectedInsertions, fpp); } private final BloomFilter<CharSequence> bloomFilter; @Override public boolean isDuplicate(Request request, Task task) { boolean isDuplicate = bloomFilter.mightContain(getUrl(request)); if (!isDuplicate) { bloomFilter.put(getUrl(request)); counter.incrementAndGet(); } return isDuplicate; } protected String getUrl(Request request) { return request.getUrl(); } @Override public void resetDuplicateCheck(Task task) { rebuildBloomFilter(); } @Override public int getTotalRequestsCount(Task task) { return counter.get(); } }