package org.calrissian.flowbox.bolt;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import org.calrissian.flowbox.model.*;
import org.calrissian.flowbox.support.Window;
import org.calrissian.flowbox.support.WindowItem;

import java.util.*;
import java.util.concurrent.TimeUnit;

import static com.google.common.collect.Iterables.concat;
import static org.calrissian.flowbox.Constants.*;
import static org.calrissian.flowbox.FlowboxFactory.declareOutputStreams;
import static org.calrissian.flowbox.spout.MockFlowLoaderSpout.FLOW_LOADER_STREAM;

/**
 * Sliding-window join semantics are defined very similarly to those of InfoSphere Streams. By default, the join
 * operator triggers on each single input event arriving on the right-hand stream.
 *
 * The stream on the right is joined against the stream on the left, where the left stream is collected into a
 * window that is evicted by the given policy. The right stream has a default eviction policy of COUNT with a
 * threshold of 1. Every time a tuple is encountered on the right stream, it is compared against the window for
 * the left stream and a new tuple is emitted for each match in the join.
 *
 * By default, if no partitioning has been applied before the join, every event received on the right stream is
 * joined with every event currently in the window for the left-hand stream.
 *
 * It's possible for events to have multi-valued keys, thus it's possible for merged tuples to turn a
 * single-valued key into a multi-valued key.
 */
public class JoinBolt extends BaseRichBolt {

    Map<String, Flow> rulesMap;
    Map<String, Cache<String, Window>> windows;
    OutputCollector collector;

    @Override
    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.collector = outputCollector;
        rulesMap = new HashMap<String, Flow>();
        windows = new HashMap<String, Cache<String, Window>>();
    }
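    /*
     * Overview (comment added for clarity): execute() handles three kinds of input:
     *   1. FLOW_LOADER_STREAM - flow/rule definitions to load, update, or remove
     *   2. "tick"             - a timer stream used to expire TIME-evicted windows
     *   3. anything else      - data events, buffered from the left stream or joined from the right
     *
     * For example, if the left window for partition hash "h" currently holds events A and B,
     * an arriving right-stream event R with the same hash emits two joined events: one merging
     * A's tuples with R's and one merging B's tuples with R's.
     */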
    @Override
    public void execute(Tuple tuple) {

        if (FLOW_LOADER_STREAM.equals(tuple.getSourceStreamId())) {

            // update rules if necessary
            Collection<Flow> rules = (Collection<Flow>) tuple.getValue(0);
            Set<String> rulesToRemove = new HashSet<String>();

            // find deleted rules and remove them
            for (Flow rule : rulesMap.values()) {
                if (!rules.contains(rule))
                    rulesToRemove.add(rule.getId());
            }

            for (String ruleId : rulesToRemove) {
                rulesMap.remove(ruleId);
                windows.remove(ruleId);
            }

            for (Flow rule : rules) {
                // if a rule has been added or updated, drop its windows and start out fresh
                if (!rulesMap.containsKey(rule.getId()) || !rulesMap.get(rule.getId()).equals(rule)) {
                    rulesMap.put(rule.getId(), rule);
                    windows.remove(rule.getId());
                }
            }
        } else if ("tick".equals(tuple.getSourceStreamId())) {

            // don't bother evaluating if we don't even have any rules
            if (rulesMap.size() > 0) {
                for (Flow rule : rulesMap.values()) {
                    for (StreamDef stream : rule.getStreams()) {
                        int idx = 0;
                        for (FlowOp curOp : stream.getFlowOps()) {
                            if (curOp instanceof JoinOp) {
                                JoinOp op = (JoinOp) curOp;

                                // if we need to trigger any time-based policies, do that here
                                if (op.getEvictionPolicy() == Policy.TIME) {
                                    Cache<String, Window> buffersForRule =
                                            windows.get(windowKey(rule.getId(), stream.getName(), idx));
                                    if (buffersForRule != null)
                                        for (Window buffer : buffersForRule.asMap().values())
                                            buffer.timeEvict(op.getEvictionThreshold());
                                }
                            }
                            idx++;
                        }
                    }
                }
            }
        } else {

            // short circuit if we don't have any rules
            if (rulesMap.size() > 0) {

                String ruleId = tuple.getStringByField(FLOW_ID);
                String hash = tuple.contains(PARTITION) ? tuple.getStringByField(PARTITION) : "";
                Event event = (Event) tuple.getValueByField(EVENT);
                int idx = tuple.getIntegerByField(FLOW_OP_IDX);
                idx++;

                String streamName = tuple.getStringByField(STREAM_NAME);
                String previousStream = tuple.getStringByField(LAST_STREAM);

                Flow flow = rulesMap.get(ruleId);
                JoinOp op = (JoinOp) flow.getStream(streamName).getFlowOps().get(idx);

                if (previousStream.equals(op.getLeftStream())) {

                    // left-hand side: buffer the event in the window for its partition
                    Cache<String, Window> buffersForRule = windows.get(windowKey(flow.getId(), streamName, idx));
                    Window buffer = null;
                    if (buffersForRule != null) {
                        buffer = buffersForRule.getIfPresent(hash);
                    } else {
                        // just in case we get some rogue data, we don't want it to sit around for too long
                        buffersForRule = CacheBuilder.newBuilder().expireAfterAccess(60, TimeUnit.MINUTES).build();
                        windows.put(windowKey(flow.getId(), streamName, idx), buffersForRule);
                    }

                    if (buffer != null) {
                        // if we need to evict any buffered items, do it here
                        if (op.getEvictionPolicy() == Policy.TIME)
                            buffer.timeEvict(op.getEvictionThreshold());
                        // perform count-based eviction if necessary
                        else if (op.getEvictionPolicy() == Policy.COUNT && buffer.size() == op.getEvictionThreshold())
                            buffer.expire();
                    } else {
                        buffer = op.getEvictionPolicy() == Policy.TIME
                                ? new Window(hash)
                                : new Window(hash, op.getEvictionThreshold());
                        buffersForRule.put(hash, buffer);
                    }

                    buffer.add(event, previousStream);

                } else if (previousStream.equals(op.getRightStream())) {

                    // right-hand side: join the incoming event against the buffered left window
                    Cache<String, Window> buffersForRule = windows.get(windowKey(flow.getId(), streamName, idx));
                    Window buffer = buffersForRule != null ? buffersForRule.getIfPresent(hash) : null;

                    if (buffer != null) {
                        for (WindowItem bufferedEvent : buffer.getEvents()) {
                            Event joined = new Event(bufferedEvent.getEvent().getId(),
                                    bufferedEvent.getEvent().getTimestamp());
                            // the hashcode will filter duplicates
                            joined.putAll(concat(bufferedEvent.getEvent().getTuples().values()));
                            joined.putAll(concat(event.getTuples().values()));

                            String nextStream = idx + 1 < flow.getStream(streamName).getFlowOps().size()
                                    ? flow.getStream(streamName).getFlowOps().get(idx + 1).getComponentName()
                                    : "output";

                            if ((nextStream.equals("output") && flow.getStream(streamName).isStdOutput())
                                    || !nextStream.equals("output"))
                                collector.emit(nextStream, new Values(flow.getId(), joined, idx, streamName,
                                        bufferedEvent.getPreviousStream()));

                            // send to any other streams that are configured (aside from output)
                            if (nextStream.equals("output") && flow.getStream(streamName).getOutputs() != null) {
                                for (String output : flow.getStream(streamName).getOutputs()) {
                                    String outputComponent = flow.getStream(output).getFlowOps().get(0).getComponentName();
                                    collector.emit(outputComponent, new Values(flow.getId(), joined, -1, output, streamName));
                                }
                            }
                        }
                    }
                } else {
                    throw new RuntimeException(
                            "Received event for stream that does not match the join. Flowbox has been miswired.");
                }
            }
        }

        collector.ack(tuple);
    }
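    /*
     * Small helper, extracted for readability (the original inlined this concatenation at each
     * call site): windows are cached under a composite key of flow id, stream name, and the
     * index of the join operator within the stream, separated by NUL characters.
     */
    private String windowKey(String flowId, String streamName, int idx) {
        return flowId + "\0" + streamName + "\0" + idx;
    }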
    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        declareOutputStreams(outputFieldsDeclarer);
    }
}