/*
 * #!
 * %
 * Copyright (C) 2014 - 2016 Humboldt-Universität zu Berlin
 * %
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #_
 */
package de.hub.cs.dbis.aeolus.batching;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import backtype.storm.Config;
import backtype.storm.generated.GlobalStreamId;
import backtype.storm.generated.Grouping;
import backtype.storm.grouping.CustomStreamGrouping;
import backtype.storm.task.TopologyContext;
import backtype.storm.tuple.Tuple;
import backtype.storm.utils.Utils;


/**
 * {@link AbstractBatchCollector} buffers emitted tuples in batches and emits full batches. It is used by
 * {@link BatchSpoutOutputCollector} and {@link BatchOutputCollector}.
 *
 * {@link AbstractBatchCollector} uses {@code de.hub.cs.dbis.aeolus.batching.StormConnector}, which is provided as a
 * jar file. This jar file needs to be built manually (see folder aeolus/aeolus-storm-connector).
 *
 * @author mjsax
 */
public abstract class AbstractBatchCollector {
    protected final static Logger logger = LoggerFactory.getLogger(AbstractBatchCollector.class);

    /**
     * The sizes of the output batches for each output stream.
     */
    private final Map<String, Integer> batchSizes;
    /**
     * The number of attributes of the output schema for each output stream.
     */
    private final Map<String, Integer> numberOfAttributes = new HashMap<String, Integer>();
    /**
     * The current runtime environment.
     */
    private final TopologyContext topologyContext;
    /**
     * The ID of the producer operator whose output is buffered by this {@link AbstractBatchCollector}.
     */
    private final String componentId;
    /**
     * Maps output streams to their receivers.
     */
    private final Map<String, List<String>> receivers = new HashMap<String, List<String>>();
    /**
     * Contains all receivers that use fields-grouping.
     */
    private final Set<String> fieldsGroupingReceivers = new HashSet<String>();
    /**
     * Maps all receivers that use custom-grouping to the user-defined grouping.
     */
    private final Map<String, CustomStreamGrouping> customGroupingReceivers = new HashMap<String, CustomStreamGrouping>();
    /**
     * Stores the dop (degree of parallelism) of each receiver.
     */
    private final Map<String, Integer> numberOfReceiverTasks = new HashMap<String, Integer>();
    /**
     * Maps each output stream to a task-id-to-batch-index map.
     *
     * Task IDs can have arbitrary values, but we need values from 0 to number-of-batches. Thus, we assign an
     * appropriate index value to each task ID.
     */
    private final Map<String, Map<Integer, Integer>> streamBatchIndexMapping = new HashMap<String, Map<Integer, Integer>>();
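    // Illustration of the task-id-to-batch-index mapping (hypothetical task IDs): if a fields-grouped receiver of
    // stream "s" runs as tasks {7, 9, 12}, the map for "s" stores 7 -> 0, 9 -> 1, 12 -> 2; the compact index is
    // combined with the receiver's weight (see below) to address a slot in the output-buffer array.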
    /**
     * Maps output streams to corresponding output buffers.
     *
     * The number of output buffers depends on the number of logical receivers as well as each logical receiver's
     * distribution pattern and parallelism.
     */
    private final Map<String, Batch[]> outputBuffers = new HashMap<String, Batch[]>();
    /**
     * Maps direct output streams to corresponding output buffers. Each consumer task has its own output buffer.
     */
    private final Map<String, Map<Integer, Batch>> directOutputBuffers = new HashMap<String, Map<Integer, Batch>>();
    /**
     * Assigns a "weight" to each receiver that uses fields-grouping. This weight is necessary to compute the correct
     * index within the list of output buffers.
     */
    private final Map<String, Integer> weights = new HashMap<String, Integer>();

    /**
     * Creates a new {@link AbstractBatchCollector} that emits batches of size {@code batchSize}.
     *
     * @param context
     *            The current runtime environment.
     * @param batchSize
     *            The batch size to be used for all output streams.
     */
    public AbstractBatchCollector(TopologyContext context, int batchSize) {
        this(context, new SingleBatchSizeHashMap(batchSize));
    }

    /**
     * Creates a new {@link AbstractBatchCollector} that emits batches of different sizes.
     *
     * @param context
     *            The current runtime environment.
     * @param batchSizes
     *            The batch sizes for each output stream.
     */
    public AbstractBatchCollector(TopologyContext context, Map<String, Integer> batchSizes) {
        this(context, new HashMap<String, Integer>(batchSizes));
    }

    private AbstractBatchCollector(TopologyContext context, HashMap<String, Integer> batchSizes) {
        logger.trace("batchSizes: {}", batchSizes);
        this.batchSizes = batchSizes;
        this.topologyContext = context;
        this.componentId = context.getThisComponentId();
        logger.trace("this-id: {}", this.componentId);

        // StreamId -> ReceiverId -> Grouping
        for(Entry<String, Map<String, Grouping>> outputStream : context.getThisTargets().entrySet()) {
            final String streamId = outputStream.getKey();
            logger.trace("output-stream: {}", streamId);

            // If the current stream is an Aeolus-defined direct stream, we add the user-defined batch size for it:
            // i.e., user-defined stream == "userStream"; current (Aeolus-defined) stream == "aeolus::userStream"
            // -> we take the batch size specified for "userStream" and add an entry for "aeolus::userStream" with
            // the same batch size value.
            if(streamId.startsWith(BatchingOutputFieldsDeclarer.STREAM_PREFIX)) {
                this.batchSizes.put(streamId,
                    this.batchSizes.get(streamId.substring(BatchingOutputFieldsDeclarer.STREAM_PREFIX.length())));
            }

            Integer bS = this.batchSizes.get(streamId);
            if(bS == null || bS.intValue() <= 0) {
                logger.trace("batching disabled");
                continue;
            }

            final Map<String, Grouping> streamReceivers = outputStream.getValue();
            final int numAttributes = context.getComponentOutputFields(this.componentId, streamId).size();
            this.numberOfAttributes.put(streamId, new Integer(numAttributes));

            int numberOfBatches = 1;
            final ArrayList<String> receiverIds = new ArrayList<String>(streamReceivers.size());
            this.receivers.put(streamId, receiverIds);

            for(Entry<String, Grouping> receiver : streamReceivers.entrySet()) {
                final String receiverId = receiver.getKey();
                receiverIds.add(receiverId);
                final List<Integer> taskIds = context.getComponentTasks(receiverId);
                logger.trace("receiver and tasks: {} - {}", receiverId, taskIds);

                final Grouping receiverGrouping = receiver.getValue();
                if(receiverGrouping.is_set_direct()) {
                    logger.trace("directGrouping");
                    Map<Integer, Batch> outputBatches = this.directOutputBuffers.get(streamId);
                    if(outputBatches == null) {
                        outputBatches = new HashMap<Integer, Batch>();
                        this.directOutputBuffers.put(streamId, outputBatches);
                    }
                    for(Integer taskId : taskIds) {
                        outputBatches.put(taskId, new Batch(this.batchSizes.get(streamId).intValue(), numAttributes));
                    }
                    numberOfBatches = 0; // mark as direct output stream
                } else if(receiverGrouping.is_set_fields()) {
                    // do not consider as regular fields- or custom-grouping if emulated by directGrouping
                    for(Entry<String, Map<String, Grouping>> outputStream2 : context.getThisTargets().entrySet()) {
                        if(outputStream2.getKey().equals(BatchingOutputFieldsDeclarer.STREAM_PREFIX + streamId)) {
                            final Map<String, Grouping> streamReceivers2 = outputStream2.getValue();
                            for(Entry<String, Grouping> receiver2 : streamReceivers2.entrySet()) {
                                if(receiver2.getKey().equals(receiverId)) {
                                    assert (receiver2.getValue().is_set_direct());
                                    numberOfBatches = 0; // mark as emulated via direct output stream
                                }
                            }
                        }
                    }

                    if(numberOfBatches != 0) {
                        // TODO: we could reduce the number of output buffers if two logical consumers use the same
                        // output fields for partitioning AND have the same dop
                        logger.trace("fieldsGrouping");
                        this.fieldsGroupingReceivers.add(receiverId);
                        this.weights.put(receiverId, new Integer(numberOfBatches));
                        numberOfBatches *= taskIds.size();

                        Map<Integer, Integer> taskToIndex = this.streamBatchIndexMapping.get(streamId);
                        if(taskToIndex == null) {
                            taskToIndex = new HashMap<Integer, Integer>();
                            this.streamBatchIndexMapping.put(streamId, taskToIndex);
                        }
                        int i = 0;
                        for(Integer tId : taskIds) {
                            taskToIndex.put(tId, new Integer(i));
                            ++i;
                        }
                    }
                } else if(receiverGrouping.is_set_custom_serialized()) {
                    logger.trace("customGrouping");
                    CustomStreamGrouping customGrouping = (CustomStreamGrouping)Utils.deserialize(receiver.getValue()
                        .get_custom_serialized());
                    customGrouping.prepare(context, new GlobalStreamId(this.componentId, streamId), taskIds);
                    this.customGroupingReceivers.put(receiverId, customGrouping);
                    this.numberOfReceiverTasks.put(receiverId, new Integer(taskIds.size()));
                }
            }

            if(numberOfBatches > 0) {
                // otherwise, we got a direct output stream and this.directOutputBuffers is already set up
                Batch[] batches = new Batch[numberOfBatches];
                for(int i = 0; i < numberOfBatches; ++i) {
                    batches[i] = new Batch(this.batchSizes.get(streamId).intValue(), numAttributes);
                }
                this.outputBuffers.put(streamId, batches);
            }
        }
    }
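    // Worked example for the buffer indexing used by tupleEmit(...) below (hypothetical receivers and dops):
    // suppose stream "s" has two fields-grouped receivers, "A" with dop 2 and "B" with dop 3. The constructor
    // assigns weight 1 to "A" and weight 2 to "B" and allocates 1 * 2 * 3 = 6 output buffers. A tuple that
    // fields-grouping routes to A's task index 1 and B's task index 2 is buffered at index 1 * 1 + 2 * 2 = 5.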
    /**
     * Captures a regular emit call of an operator, adds the output tuple to the corresponding output buffer, and
     * emits the buffer if it gets filled completely during this call.
     *
     * @param streamId
     *            The name of the output stream the tuple is appended to.
     * @param anchors
     *            The anchor tuples of the emitted tuple (bolts only).
     * @param tuple
     *            The output tuple to be emitted.
     * @param messageId
     *            The ID of the output tuple (spouts only).
     *
     * @return currently always {@code null}, because the receiver task IDs cannot be determined if the tuple is only
     *         inserted into an output batch and no actual emit happens
     */
    public List<Integer> tupleEmit(String streamId, Collection<Tuple> anchors, List<Object> tuple, Object messageId) {
        Integer bS = this.batchSizes.get(streamId);
        if(bS == null || bS.intValue() <= 0) {
            return this.doEmit(streamId, anchors, tuple, messageId);
        }

        String directStream = BatchingOutputFieldsDeclarer.STREAM_PREFIX + streamId;
        if(this.directOutputBuffers.containsKey(directStream)) {
            // emulate by direct emit
            for(String receiverComponentId : this.receivers.get(directStream)) {
                final CustomStreamGrouping customGrouping = this.customGroupingReceivers.get(receiverComponentId);
                if(customGrouping != null) {
                    List<Integer> taskIds = customGrouping.chooseTasks(
                        this.numberOfReceiverTasks.get(receiverComponentId).intValue(), tuple);
                    for(Integer taskId : taskIds) {
                        this.tupleEmitDirect(taskId.intValue(), directStream, anchors, tuple, messageId);
                    }
                } else {
                    int taskId = StormConnector.getFieldsGroupingReceiverTaskId(this.topologyContext,
                        this.componentId, streamId, receiverComponentId, tuple).intValue();
                    this.tupleEmitDirect(taskId, directStream, anchors, tuple, messageId);
                }
            }
        } else {
            // regular batching
            int bufferIndex = 0;
            final Map<Integer, Integer> taskIndex = this.streamBatchIndexMapping.get(streamId);
            if(taskIndex != null) { // fields grouping for at least one receiver
                for(String receiverComponentId : this.receivers.get(streamId)) {
                    if(this.fieldsGroupingReceivers.contains(receiverComponentId)) {
                        Integer taskId = StormConnector.getFieldsGroupingReceiverTaskId(this.topologyContext,
                            this.componentId, streamId, receiverComponentId, tuple);
                        bufferIndex += this.weights.get(receiverComponentId).intValue()
                            * taskIndex.get(taskId).intValue();
                    }
                }
            }

            Batch[] streamBuffers = this.outputBuffers.get(streamId);
            if(streamBuffers != null) {
                final Batch buffer = streamBuffers[bufferIndex];
                buffer.addTuple(tuple);
                if(buffer.isFull()) {
                    this.doEmit(streamId, null, buffer, null);
                    this.outputBuffers.get(streamId)[bufferIndex] = new Batch(
                        this.batchSizes.get(streamId).intValue(), this.numberOfAttributes.get(streamId).intValue());
                }
            }
        }

        return null;
    }

    /**
     * Captures a regular direct-emit call of an operator, adds the output tuple to the corresponding output buffer,
     * and emits the buffer if it gets filled completely during this call.
     *
     * @param taskId
     *            The ID of the receiver task.
     * @param streamId
     *            The name of the output stream the tuple is appended to.
     * @param anchors
     *            The anchor tuples of the emitted tuple (bolts only).
     * @param tuple
     *            The output tuple to be emitted.
     * @param messageId
     *            The ID of the output tuple (spouts only).
     */
    public void tupleEmitDirect(int taskId, String streamId, Collection<Tuple> anchors, List<Object> tuple,
        Object messageId) {
        Integer bS = this.batchSizes.get(streamId);
        if(bS == null || bS.intValue() <= 0) {
            this.doEmitDirect(taskId, streamId, anchors, tuple, messageId);
            return;
        }

        Integer tid = new Integer(taskId);
        final Map<Integer, Batch> streamBuffers = this.directOutputBuffers.get(streamId);
        if(streamBuffers != null) {
            final Batch buffer = streamBuffers.get(tid);
            if(buffer != null) {
                buffer.addTuple(tuple);
                if(buffer.isFull()) {
                    this.doEmitDirect(taskId, streamId, null, buffer, null);
                    this.directOutputBuffers.get(streamId).put(tid,
                        new Batch(this.batchSizes.get(streamId).intValue(),
                            this.numberOfAttributes.get(streamId).intValue()));
                }
            }
        }
    }

    /**
     * Emits all incomplete batches from the output buffers.
     */
    public void flush() {
        for(String streamId : this.outputBuffers.keySet()) {
            Integer bS = this.batchSizes.get(streamId);
            if(bS != null && bS.intValue() > 0) {
                for(int i = 0; i < this.outputBuffers.get(streamId).length; ++i) {
                    Batch batch = this.outputBuffers.get(streamId)[i];
                    if(!batch.isEmpty()) {
                        this.doEmit(streamId, null, batch, null);
                        this.outputBuffers.get(streamId)[i] = new Batch(this.batchSizes.get(streamId).intValue(),
                            this.numberOfAttributes.get(streamId).intValue());
                    }
                }
            }
        }

        for(String streamId : this.directOutputBuffers.keySet()) {
            Integer bS = this.batchSizes.get(streamId);
            if(bS != null && bS.intValue() > 0) {
                for(Integer taskId : this.directOutputBuffers.get(streamId).keySet()) {
                    Batch batch = this.directOutputBuffers.get(streamId).get(taskId);
                    if(!batch.isEmpty()) {
                        this.doEmitDirect(taskId.intValue(), streamId, null, batch, null);
                        this.directOutputBuffers.get(streamId).put(taskId,
                            new Batch(this.batchSizes.get(streamId).intValue(),
                                this.numberOfAttributes.get(streamId).intValue()));
                    }
                }
            }
        }
    }

    /**
     * Registers the classes {@link Batch Batch.class} and {@link BatchColumn BatchColumn.class} for serialization
     * and deserialization.
     *
     * @param stormConfig
     *            The Storm config to which the classes should be registered.
     */
    public static void registerKryoClasses(Config stormConfig) {
        stormConfig.registerSerialization(Batch.class);
        stormConfig.registerSerialization(BatchColumn.class);
    }

    /**
     * Is called each time a tuple or batch should be emitted.
     *
     * @param streamId
     *            The name of the output stream the batch is appended to.
     * @param anchors
     *            The anchor tuples of the emitted batch (bolts only).
     * @param tupleOrBatch
     *            The output tuple or batch to be emitted.
     * @param messageId
     *            The ID of the output batch (spouts only).
     *
     * @return the task IDs that received the batch
     */
    protected abstract List<Integer> doEmit(String streamId, Collection<Tuple> anchors, Object tupleOrBatch,
        Object messageId);

    /**
     * Is called each time a tuple or batch should be emitted directly to a single task.
     *
     * @param taskId
     *            The ID of the receiver task.
     * @param streamId
     *            The name of the output stream the batch is appended to.
     * @param anchors
     *            The anchor tuples of the emitted batch (bolts only).
     * @param tupleOrBatch
     *            The output tuple or batch to be emitted.
     * @param messageId
     *            The ID of the output batch (spouts only).
     */
    protected abstract void doEmitDirect(int taskId, String streamId, Collection<Tuple> anchors, Object tupleOrBatch,
        Object messageId);
}
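/*
 * Usage sketch (hypothetical topology setup; the topology name and builder variable are illustrative only):
 * because full batches are emitted as Batch/BatchColumn values, these classes should be registered for Kryo
 * (de-)serialization before the topology is submitted:
 *
 *   Config config = new Config();
 *   AbstractBatchCollector.registerKryoClasses(config);
 *   StormSubmitter.submitTopology("myTopology", config, builder.createTopology());
 */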