/* * #! * % * Copyright (C) 2014 - 2016 Humboldt-Universität zu Berlin * % * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #_ */ package de.hub.cs.dbis.aeolus.spouts; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import backtype.storm.spout.SpoutOutputCollector; import backtype.storm.task.TopologyContext; import backtype.storm.topology.IRichSpout; import backtype.storm.topology.OutputFieldsDeclarer; import backtype.storm.tuple.Fields; import backtype.storm.tuple.Values; import de.hub.cs.dbis.aeolus.utils.StreamMerger; /** * {@link AbstractOrderedInputSpout} reads input tuples (of type {@code T}) from multiple sources (called * <em>partitions</em>) and pushes raw data into the topology. The default number of used partitions is one but can be * configured using {@link #NUMBER_OF_PARTITIONS}. The IDs of the partitions are {@code 0,...,NUMBER_OF_PARTITIONS-1}.<br/> * <br/> * Input data must be sorted in ascending timestamp order in each partition. For each successfully processed input * tuple, a single output tuple is emitted.<br/> * <br/> * <strong>Output schema:</strong> {@code <ts:}{@link Long}{@code ,rawTuple:T>}<br/> * Attribute {@code ts} contains the extracted timestamp value of the processed input tuple and {@code rawTuple} * contains the <em>complete</em> input tuple.<br/> * <br/> * {@link AbstractOrderedInputSpout} is parallelizable. If multiple input partitions are assigned to a single task, * {@link AbstractOrderedInputSpout} ensures that tuples are emitted in ascending timestamp order. In case of timestamp * duplicates, no ordering guarantee for all tuples having the same timestamp is given. * * @author Leonardo Aniello (Sapienza Università di Roma, Roma, Italy) * @author Roberto Baldoni (Sapienza Università di Roma, Roma, Italy) * @author Leonardo Querzoni (Sapienza Università di Roma, Roma, Italy) * @author mjsax */ public abstract class AbstractOrderedInputSpout<T> implements IRichSpout { private final static long serialVersionUID = 6224448887936832190L; private final static Logger logger = LoggerFactory.getLogger(AbstractOrderedInputSpout.class); /** * Can be used to specify the number of input partitions that are available (default value is one). The * configuration value is expected to be of type {@link Integer}. */ public final static String NUMBER_OF_PARTITIONS = "OrderedInputSpout.partitions"; /** The merger to be used. */ private StreamMerger<Values> merger; /** The output collector to be used. */ private SpoutOutputCollector collector; /** * The stream ID to be used for declaration of timestamp and raw tuple fields. Can be {@code null} which causes the * fields to be declared only. */ private final String streamID; /** * Creates a {@code AbstractOrderedInputSpout} which declares fields without explicit stream ID. */ public AbstractOrderedInputSpout() { this(null); } /** * Creates a {@code AbstractOrderedInputSpout} which declares fields on stream with ID {@code streamID}. * * @param streamID * the ID of the stream the fields ought to be declared on */ public AbstractOrderedInputSpout(String streamID) { this.streamID = streamID; } /** * {@inheritDoc} * * Sets up internal data structures according to the number of used partitions {@link #NUMBER_OF_PARTITIONS}. */ @Override public void open(@SuppressWarnings("rawtypes") Map conf, TopologyContext context, SpoutOutputCollector collector) { int numberOfPartitons = 1; Integer numPartitions = (Integer)conf.get(NUMBER_OF_PARTITIONS); if(numPartitions != null) { numberOfPartitons = numPartitions.intValue(); } logger.debug("Number of configured partitions: {}", new Integer(numberOfPartitons)); Integer[] partitionIds = new Integer[numberOfPartitons]; for(int i = 0; i < numberOfPartitons; ++i) { partitionIds[i] = new Integer(i); } this.merger = new StreamMerger<Values>(Arrays.asList(partitionIds), 0); this.collector = collector; } /** * Makes a new output tuple available. * * Must be used by {@link #nextTuple()} instead of an {@link SpoutOutputCollector} to emit tuples. In each call, all * tuples that are still buffered over all partitions are considered to be emitted to the default output stream. * * @param index * The partition id the tuple belongs to. * @param timestamp * The timestamp of the tuple. * @param tuple * The tuple to be emitted. * * @return A map of all tuple that got emitted during this call including the task IDs each emitted tuple was sent * to. */ // TODO: add support for non-default and/or multiple output streams (what about directEmit(...)?) protected final Map<Values, List<Integer>> emitNextTuple(Integer index, Long timestamp, T tuple) { logger.trace("Received new output tuple (partitionId, ts, tuple): {}, {}, {}", index, timestamp, tuple); if(index != null && timestamp != null && tuple != null) { this.merger.addTuple(index, new Values(timestamp, tuple)); } Values t; Map<Values, List<Integer>> emitted = new HashMap<Values, List<Integer>>(); while((t = this.merger.getNextTuple()) != null) { logger.trace("Emitting tuple: {}", t); emitted.put(t, this.collector.emit(t)); } return emitted; } /** * Closes an input partition. Closing a partition is only successful, if no tuples belonging to the partition are * buffered internally any more. No more data can be emitted by this partition if closing was successful. * * @param partitionId * The ID of the partition to be closed. * * @return {@code true} if the partition was closed successfully -- {@code false} otherwise */ protected boolean closePartition(Integer partitionId) { return this.merger.closePartition(partitionId); } /** * Declares the two fields necessary for transmitting tuples with a timestamp. Calling * {@code super.declareOutputFields} in overriding methods is strongly recommended. * * @param declarer */ @Override public void declareOutputFields(OutputFieldsDeclarer declarer) { Fields fields = new Fields("ts", "rawTuple"); if(this.streamID == null) { declarer.declare(fields); } else { declarer.declareStream(this.streamID, fields); } } }