package com.github.purplepapa.Storm_Simple_Crawler;
import java.util.Map;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;
import orestes.bloomfilter.BloomFilter;
import orestes.bloomfilter.FilterBuilder;
public class URLDeduplicatorBolt extends BaseRichBolt {
/**
*
*/
private static final long serialVersionUID = -4971329030781055622L;
private OutputCollector collector;
public void prepare(Map config, TopologyContext context,
OutputCollector collector) {
this.collector = collector;
}
public void execute(Tuple tuple) {
String input = (String) tuple.getValue(0);
System.out.println("in url dedup:" + input);
String host = "localhost";
int port = 6379;
String filterName = "urlbloomfilter";
// Open a Redis-backed Bloom filter
BloomFilter<String> bfr = new FilterBuilder(1000, 0.01)
.name(filterName).redisBacked(true).redisHost(host)
.redisPort(port).buildBloomFilter();
if (!bfr.contains(input)) {
bfr.add(input);
System.out.println("NOT DUP:" + input);
} else {
System.out.println("MAY DUP:" + input);
}
collector.ack(tuple);
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
// declarer.declare(new Fields("word", "count"));
}
}