/*
* Copyright (c) 2011-2015 EPFL DATA Laboratory
* Copyright (c) 2014-2015 The Squall Collaboration (see NOTICE)
*
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ch.epfl.data.squall.ewh.storm_components.stream_grouping;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Logger;
import backtype.storm.generated.GlobalStreamId;
import backtype.storm.grouping.CustomStreamGrouping;
import backtype.storm.task.WorkerTopologyContext;
import ch.epfl.data.squall.predicates.ComparisonPredicate;
import ch.epfl.data.squall.types.NumericType;
import ch.epfl.data.squall.utilities.DeepCopy;
import ch.epfl.data.squall.utilities.MyUtilities;
import ch.epfl.data.squall.utilities.SystemParameters;
import ch.epfl.data.squall.utilities.SystemParameters.HistogramType;
// This class is used only for connecting other components to D2Component
// Divides into contiguous key ranges, and assign keys around the boundary to multiple targets
// ranges are [beginning, end)
// To be parameterized with KeyType, we need to put some of the logic into the invoking StormComponent class
// in that case, we can replace getDistance method with compareTo
public class RangeMulticastStreamGrouping implements CustomStreamGrouping,
Serializable {
private static final long serialVersionUID = 1L;
private static Logger LOG = Logger
.getLogger(RangeMulticastStreamGrouping.class);
// the number of tasks on the level this stream grouping is sending to
private int _numTargetTasks;
private List<Integer> _targetTasks;
private List _rangeBoundaries = new ArrayList(); // (numTargetTasks - 1) of
// them
protected final Map _map;
protected NumericType _wrapper;
private ComparisonPredicate _comparison;
private int _numLastJoiners;
private HistogramType _histType;
// without multicast
public RangeMulticastStreamGrouping(Map map, NumericType wrapper,
HistogramType histType) {
_map = map;
_wrapper = wrapper;
// this constructor is always invoked, directly or indirectly
_histType = histType;
_numLastJoiners = SystemParameters.getInt(_map, "PAR_LAST_JOINERS");
if (histType != null) {
// this is in the constructor, as it requires reading files from
// local machine
_rangeBoundaries = createBoundariesFromHistogram(
histType.filePrefix(), _numLastJoiners);
}
}
// with multicast
public RangeMulticastStreamGrouping(Map map,
ComparisonPredicate comparison, NumericType wrapper,
HistogramType histType) {
this(map, wrapper, histType);
_comparison = comparison;
}
private boolean isMulticast() {
return _comparison != null;
}
@Override
public List<Integer> chooseTasks(int taskId, List<Object> stormTuple) {
final List<String> tuple = (List<String>) stormTuple.get(1);
final String tupleHash = (String) stormTuple.get(2);
if (MyUtilities.isFinalAck(tuple, _map))
// send to everyone
return _targetTasks;
else {
List<Integer> targetIndexes = chooseTargetIndex(tupleHash);
List<Integer> tupleTargetTasks = new ArrayList<Integer>();
for (int targetIndex : targetIndexes) {
tupleTargetTasks.add(_targetTasks.get(targetIndex));
}
return tupleTargetTasks;
}
}
private List<Integer> chooseTargetIndex(String tupleHash) {
List<Integer> targetIndexes = new ArrayList<Integer>();
Object hash = _wrapper.fromString(tupleHash);
if (_numTargetTasks <= 1) {
// 1 target tasks, everything goes there
targetIndexes.add(0);
} else {
if (isMulticast()) {
targetIndexes = chooseTargetIndexMulticast(hash);
} else {
targetIndexes = Arrays
.asList(chooseTargetIndexNonMulticast(hash));
}
}
return targetIndexes;
}
private int chooseTargetIndexNonMulticast(Object hash) {
return chooseTaskIndex(hash, _rangeBoundaries);
}
// numTargetTasks = rangeBoundaries.size() + 1
protected int chooseTaskIndex(Object key, List rangeBoundaries) {
List<Integer> taskIndexes = new ArrayList<Integer>();
int numTasks = rangeBoundaries.size() + 1;
if (rangeBoundaries.isEmpty()) {
// redundant for this class but not for its child
taskIndexes.add(0);
return taskIndexes.get(0);
}
// check for first and last
// d2Min and d2Max are different from firstBoundary and lastBoundary
Object firstBoundary = rangeBoundaries.get(0);
Object lastBoundary = rangeBoundaries.get(numTasks - 2);
if (_wrapper.getDistance(firstBoundary, key) > 0) {
taskIndexes.add(0);
} else if (_wrapper.getDistance(key, lastBoundary) >= 0) {
taskIndexes.add(numTasks - 1);
}
// check for others
for (int i = 0; i < numTasks - 2; i++) {
Object lowerBound = rangeBoundaries.get(i);
Object upperBound = rangeBoundaries.get(i + 1);
if (_wrapper.getDistance(key, lowerBound) >= 0
&& _wrapper.getDistance(upperBound, key) > 0) {
taskIndexes.add(i + 1);
}
}
if (taskIndexes.size() != 1) {
throw new RuntimeException(
"Developer error! Should not have size(targets) = "
+ taskIndexes.size());
}
return taskIndexes.get(0);
}
// works for _numTargetTasks >= 2
private List<Integer> chooseTargetIndexMulticast(Object hash) {
int op = _comparison.getOperation();
if (op == ComparisonPredicate.EQUAL_OP) {
return Arrays.asList(chooseTargetIndexNonMulticast(hash));
} else if (op == ComparisonPredicate.SYM_BAND_WITH_BOUNDS_OP
|| op == ComparisonPredicate.SYM_BAND_NO_BOUNDS_OP) {
// diff is inclusive
int diff = (Integer) _comparison.getDiff();
if (op == ComparisonPredicate.SYM_BAND_NO_BOUNDS_OP) {
diff--;
}
// it returns all the indexes in range of
// [chooseTargetIndexNonMulticast(hash - diff),
// chooseTargetIndexNonMulticast(hash + diff)]
Object minJoinable = _wrapper.getOffset(hash, -diff);
int minJoinableTargetIndex = chooseTargetIndexNonMulticast(minJoinable);
Object maxJoinable = _wrapper.getOffset(hash, diff);
int maxJoinableTargetIndex = chooseTargetIndexNonMulticast(maxJoinable);
List<Integer> targetIndexes = new ArrayList<Integer>();
for (int i = minJoinableTargetIndex; i <= maxJoinableTargetIndex; i++) {
targetIndexes.add(i);
}
return targetIndexes;
} else {
throw new RuntimeException("Unsupported operator " + op);
}
}
@Override
public void prepare(WorkerTopologyContext wtc, GlobalStreamId gsi,
List<Integer> targetTasks) {
// LOG.info("Target tasks for Range(Multicast) are " + targetTasks);
// seems sorted on each producer
_targetTasks = targetTasks;
_numTargetTasks = targetTasks.size();
// This class is used only for connecting other components to
// D2Component
if (_histType == null) {
// _numTargetTasks can differ from PAR_LAST_JOINERS - that's why we
// do this in the prepare method
createBoundariesStatic();
} else {
if (_numTargetTasks != _numLastJoiners) {
throw new RuntimeException(
"_numTargetTasks = "
+ _numTargetTasks
+ ", _numLastJoiners = "
+ _numLastJoiners
+ " and they differ.\n"
+ " One should not need it, but if needs, deserialization in createBoundariesFromHistogram should be changed.");
}
}
// check in both cases
checkBoundaries();
}
// divides MIN and MAX values from conf into _numTargetTasks segments
// there are _numTargetTasks - 1 boundaries
private void createBoundariesStatic() {
// no boundaries if only one targetTask
if (_numTargetTasks > 1) {
String d2MinStr = SystemParameters.getString(_map, "D2_EQUI_MIN");
String d2MaxStr = SystemParameters.getString(_map, "D2_EQUI_MAX");
Object d2Min = _wrapper.fromString(d2MinStr);
Object d2Max = _wrapper.fromString(d2MaxStr);
int distance = (int) _wrapper.getDistance(d2Max, d2Min);
int unitDistance = distance / _numTargetTasks;
for (int i = 0; i < _numTargetTasks - 1; i++) {
Object boundary = _wrapper.getOffset(d2Min, (i + 1)
* unitDistance);
_rangeBoundaries.add(boundary);
LOG.info("Added boundary " + boundary);
}
}
}
// there are parallelism(ex _numTargetTasks) - 1 boundaries
protected List createBoundariesFromHistogram(String filePrefix,
int parallelism) {
List result = new ArrayList();
// no boundaries if only parallelism = 1
if (parallelism > 1) {
String histogramFilename = MyUtilities.getHistogramFilename(_map,
parallelism, filePrefix);
LOG.info("HistogramFilename for " + filePrefix + " = "
+ histogramFilename);
// "Most impressive is that the entire process is JVM independent,
// meaning an object can be serialized on one platform and
// deserialized on an entirely different platform."
result = (List) DeepCopy.deserializeFromFile(histogramFilename);
LOG.info("Boundaries are " + result);
}
return result;
}
private void checkBoundaries() {
// TODO We require that no neighbor elements in _rangeBoundaries are the
// same
// Otherwise, RangeMulticastStreamGrouping.chooseTargetIndexNonMulticast
// does not work
// We will implement this only if we encounter the following runtime
// exception
// To do it, we need to add some probabilities, similar to
// ContentSensitiveMatrixAssignment
Object lastBoundary = null;
for (Object boundary : _rangeBoundaries) {
if (boundary.equals(lastBoundary)) {
// it suffices to compare only neighbors, as _rangeBoundaries
// are sorted, by definition
throw new RuntimeException(
"Two neighbor boundaries are the same: " + boundary);
}
lastBoundary = boundary;
}
// another checkup
if (_rangeBoundaries.size() != _numTargetTasks - 1) {
throw new RuntimeException("Developer error: Boundaries size is "
+ _rangeBoundaries.size() + " and _numTargetTasks = "
+ _numTargetTasks);
}
}
}