/*
* #!
* %
* Copyright (C) 2014 - 2016 Humboldt-Universität zu Berlin
* %
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #_
*/
package de.hub.cs.dbis.aeolus.utils;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map.Entry;
import java.util.NoSuchElementException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
/**
* {@link StreamMerger} merges multiple sub-streams (partitions) in ascending timestamp order. Type {@code T} is
* expected to be either {@link Tuple} (for usage in bolts) or {@link Values} (for usage in spouts).
*
* @author mjsax
*/
// TODO: make more efficient (avoid linear scan of all partitions to extract next tuple)
public class StreamMerger<T> {
    private static final Logger logger = LoggerFactory.getLogger(StreamMerger.class);

    /** The index of the timestamp attribute ({@code -1} if attribute name or timestamp extractor is used). */
    private final int tsIndex;
    /** The name of the timestamp attribute ({@code null} if attribute index or timestamp extractor is used). */
    private final String tsAttributeName;
    /** The extractor for the timestamp ({@code null} if attribute index or name is used). */
    private final TimeStampExtractor<T> tsExtractor;

    /** Input tuple buffer for merging. Contains a list of input tuples for each producer task. */
    private final HashMap<Integer, LinkedList<T>> mergeBuffer = new HashMap<Integer, LinkedList<T>>();
    /** Contains the IDs of all disabled partitions. */
    private final HashSet<Integer> disabledPartitions = new HashSet<Integer>();
    /** Maximum timestamp value that was emitted already. */
    private long latestTs = Long.MIN_VALUE;

    /**
     * Instantiates a new {@link StreamMerger}. Must be used if {@code T} is type {@link Values}.
     *
     * @param partitionIds
     *            The IDs of the sub-streams (partitions) that should be merged.
     * @param tsIndex
     *            The index of the timestamp attribute.
     */
    public StreamMerger(Collection<Integer> partitionIds, int tsIndex) {
        assert (partitionIds != null);
        assert (tsIndex >= 0);

        logger.debug("Initializing with timestamp index: {}", Integer.valueOf(tsIndex));

        this.tsIndex = tsIndex;
        this.tsAttributeName = null;
        this.tsExtractor = null;

        this.initialize(partitionIds);
    }

    /**
     * Instantiates a new {@link StreamMerger}. Can only be used if {@code T} is type {@link Tuple}.
     *
     * @param partitionIds
     *            The IDs of the sub-streams (partitions) that should be merged.
     * @param tsAttributeName
     *            The name of the timestamp attribute.
     */
    public StreamMerger(Collection<Integer> partitionIds, String tsAttributeName) {
        assert (partitionIds != null);
        assert (tsAttributeName != null);

        logger.debug("Initializing with timestamp attribute: {}", tsAttributeName);

        this.tsIndex = -1;
        this.tsAttributeName = tsAttributeName;
        this.tsExtractor = null;

        this.initialize(partitionIds);
    }

    /**
     * Instantiates a new {@link StreamMerger}. Can only be used if {@code T} is type {@link Tuple}.
     *
     * @param partitionIds
     *            The IDs of the sub-streams (partitions) that should be merged.
     * @param tsExtractor
     *            The extractor for the timestamp.
     */
    public StreamMerger(Collection<Integer> partitionIds, TimeStampExtractor<T> tsExtractor) {
        assert (partitionIds != null);
        assert (tsExtractor != null);

        logger.debug("Initializing with timestamp extractor: {}", tsExtractor);

        this.tsIndex = -1;
        this.tsAttributeName = null;
        this.tsExtractor = tsExtractor;

        this.initialize(partitionIds);
    }

    /**
     * Creates one (initially empty) buffer per partition ID.
     *
     * @param partitionIds
     *            The IDs of the partitions to be merged.
     */
    private void initialize(Collection<Integer> partitionIds) {
        logger.debug("Initializing partition buffer: {}", partitionIds);

        for(Integer partition : partitionIds) {
            this.mergeBuffer.put(partition, new LinkedList<T>());
        }
    }

    /**
     * Adds a tuple belonging to partition {@code partitionNumber} to the internal merging buffer. Assumes that the
     * timestamp of the inserted tuple is not smaller than the timestamp of the tuple that was added last to the same
     * partition (this is only checked via {@code assert}).
     *
     * @param partitionNumber
     *            The ID of the partition the tuple belongs to; must be the ID of an open partition.
     * @param t
     *            The tuple to be added.
     */
    public void addTuple(Integer partitionNumber, T t) {
        logger.trace("Add tuple to buffer (partitionId, tuple): {}, {}", partitionNumber, t);
        assert (partitionNumber != null);
        assert (t != null);

        LinkedList<T> partitionBuffer = this.mergeBuffer.get(partitionNumber);
        assert (partitionBuffer != null);
        assert (partitionBuffer.isEmpty() || this.getTsValue(partitionBuffer.getLast()) <= this.getTsValue(t));

        partitionBuffer.addLast(t);
    }

    /**
     * Returns the next tuple from the internal merging buffer. A tuple can be returned, if it has the same timestamp as
     * the last extracted tuple. If all tuples have a larger timestamp than the last returned tuple, the tuple with the
     * smallest timestamp is returned iff at least one tuple is present in each buffer. Empty partitions that have been
     * disabled via {@link #disablePartition(Integer)} are removed during the scan and do not block extraction.
     *
     * @return The next tuple in ascending timestamp order -- {@code null} if no tuple could be extracted.
     */
    public T getNextTuple() {
        long minTsFound = Long.MAX_VALUE;
        boolean eachBufferFilled = true;
        Integer minTsPartitionNumber = null;
        LinkedList<T> minTsBuffer = null;

        Iterator<Entry<Integer, LinkedList<T>>> it = this.mergeBuffer.entrySet().iterator();
        while(it.hasNext()) {
            Entry<Integer, LinkedList<T>> partition = it.next();
            LinkedList<T> partitionBuffer = partition.getValue();

            // explicit emptiness check instead of catching NoSuchElementException; this way, an exception raised
            // while extracting the timestamp is never mistaken for an empty partition
            if(partitionBuffer.isEmpty()) {
                if(this.disabledPartitions.contains(partition.getKey())) {
                    logger.trace("Closing empty and disabled parition: {}", partition.getKey());
                    it.remove();
                } else {
                    logger.trace("Found empty parition: {}", partition.getKey());
                    eachBufferFilled = false;
                    // no BREAK: we stay in the loop, because we still might find a tuple with equal ts value as
                    // last returned tuple
                }
                continue;
            }

            T head = partitionBuffer.getFirst();
            long ts = this.getTsValue(head);
            assert (ts >= this.latestTs);

            if(ts == this.latestTs) {
                logger.trace("Extract tuple with same timestamp (partition, tuple): {}, {}", partition.getKey(), head);
                return partitionBuffer.removeFirst();
            }

            // accept the first candidate unconditionally, so that a head tuple with timestamp Long.MAX_VALUE can
            // still be selected
            if(minTsBuffer == null || ts < minTsFound) {
                minTsFound = ts;
                minTsPartitionNumber = partition.getKey();
                minTsBuffer = partitionBuffer;
            }
        }

        if(eachBufferFilled && minTsBuffer != null) {
            logger.trace("Extract tuple min timestamp (ts, partition, tuple): {}, {}, {}", Long.valueOf(minTsFound),
                minTsPartitionNumber, minTsBuffer.getFirst());
            this.latestTs = minTsFound;
            return minTsBuffer.removeFirst();
        }

        logger.trace("Could not extract tuple.");
        return null;
    }

    /**
     * Extracts the timestamp of the given tuple.
     *
     * For a {@link Tuple} received on stream {@code TimestampMerger.FLUSH_STREAM_ID}, the value at index 0 is used.
     * For any other {@link Tuple}, the timestamp is taken from the configured index, attribute name, or extractor (in
     * this order of precedence). For {@link Values}, the configured index is used.
     *
     * @param tuple
     *            The tuple to extract the timestamp from.
     *
     * @return the timestamp of the given tuple
     */
    private long getTsValue(T tuple) {
        if(tuple instanceof Tuple) {
            Tuple t = (Tuple)tuple;

            if(t.getSourceStreamId().equals(TimestampMerger.FLUSH_STREAM_ID)) {
                return ((Number)t.getValue(0)).longValue();
            }
            if(this.tsIndex != -1) {
                return ((Number)t.getValue(this.tsIndex)).longValue();
            }
            if(this.tsAttributeName != null) {
                return ((Number)t.getValueByField(this.tsAttributeName)).longValue();
            }
            return this.tsExtractor.getTs(tuple);
        }

        assert (tuple instanceof Values);
        return ((Number)((Values)tuple).get(this.tsIndex)).longValue();
    }

    /**
     * Removes an empty partition from the internal buffer.
     *
     * Can be used to 'unblock' {@link StreamMerger} in case of a completely consumed partition. An empty partition
     * prevents {@link #getNextTuple()} to return tuples from the remaining (non-empty) partition buffers, because it
     * assumes that new data is inserted into the currently empty partition buffer later on. Hence, if it is guaranteed,
     * that a partition does not yield any more data, it must be removed for further processing of the remaining
     * partitions.<br/>
     * <br/>
     * <strong>Only empty partitions can be removed.</strong>
     *
     * @param partitionId
     *            The partition to be removed.
     *
     * @return {@code true} if the partition was successfully removed -- {@code false} otherwise (ie, if the partition
     *         is not empty, or if no open partition with the given ID exists)
     */
    public boolean closePartition(Integer partitionId) {
        LinkedList<T> partitionBuffer = this.mergeBuffer.get(partitionId);

        // an unknown or already closed partition yields null; report failure instead of throwing an NPE
        if(partitionBuffer != null && partitionBuffer.isEmpty()) {
            logger.debug("Closing partition: {}", partitionId);
            this.mergeBuffer.remove(partitionId);
            return true;
        }

        logger.debug("Closing partition {} failed.", partitionId);
        return false;
    }

    /**
     * Returns the number of open partitions.
     *
     * @return the number of open partitions
     */
    public int getNumberOpenPartitions() {
        return this.mergeBuffer.size();
    }

    /**
     * Disables a partition. A disabled partition cannot block the merger even if it is empty: once its buffer is
     * found empty by {@link #getNextTuple()}, the partition is removed.
     *
     * @param partitionId
     *            the partition that should be disabled
     */
    public void disablePartition(Integer partitionId) {
        this.disabledPartitions.add(partitionId);
    }
}