/*
* Copyright (c) 2011-2015 EPFL DATA Laboratory
* Copyright (c) 2014-2015 The Squall Collaboration (see NOTICE)
*
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ch.epfl.data.squall.ewh.storm_components;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Logger;
import backtype.storm.Config;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.InputDeclarer;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import ch.epfl.data.squall.predicates.ComparisonPredicate;
import ch.epfl.data.squall.storm_components.StormComponent;
import ch.epfl.data.squall.storm_components.StormEmitter;
import ch.epfl.data.squall.storm_components.synchronization.TopologyKiller;
import ch.epfl.data.squall.types.NumericType;
import ch.epfl.data.squall.utilities.DeepCopy;
import ch.epfl.data.squall.utilities.MyUtilities;
import ch.epfl.data.squall.utilities.SystemParameters;
import ch.epfl.data.squall.utilities.SystemParameters.HistogramType;
//equi-depth histogram on one or both input relations
public class EquiDepthHistogramBolt<JAT extends Number & Comparable<JAT>>
extends BaseRichBolt {
private static final long serialVersionUID = 1L;
private static Logger LOG = Logger.getLogger(EquiDepthHistogramBolt.class);
private StormEmitter _r1Emitter, _r2Emitter;
private String _r1EmitterIndex, _r2EmitterIndex;
private final String _componentName;
private NumericType _wrapper;
private ComparisonPredicate _comparison;
private Map _conf;
private OutputCollector _collector;
private int _numRemainingParents;
private int _numOfLastJoiners; // input to tiling algorithm
// private ListAdapter<JAT> _joinKeys = new ListJavaGeneric<JAT>();
// private ListAdapter<JAT> _joinKeys = new ListTIntAdapter();
// to avoid some boxing/unboxing, we could directly use TIntList
private List<JAT> _joinKeys1 = new ArrayList<JAT>();
private List<JAT> _joinKeys2 = new ArrayList<JAT>();
public EquiDepthHistogramBolt(StormEmitter r1Source, StormEmitter r2Source,
String componentName, int numOfLastJoiners,
NumericType<JAT> wrapper, ComparisonPredicate comparison,
List<String> allCompNames, TopologyBuilder builder,
TopologyKiller killer, Config conf) {
if (r2Source != null) { // is the data source which feeds D2Combiner
_r2Emitter = r2Source;
_r2EmitterIndex = String.valueOf(allCompNames.indexOf(r2Source
.getName()));
}
if (r1Source != null) { // is the data source one which feeds
// S1Reservoir directly
_r1Emitter = r1Source;
_r1EmitterIndex = String.valueOf(allCompNames.indexOf(r1Source
.getName()));
}
_componentName = componentName;
_numOfLastJoiners = numOfLastJoiners;
_conf = conf;
_comparison = comparison;
_wrapper = wrapper;
// _joinKeys = MyUtilities.createListAdapter(conf);
final int parallelism = 1;
// connecting with previous level
InputDeclarer currentBolt = builder.setBolt(componentName, this,
parallelism);
if (_r1Emitter != null) {
currentBolt = MyUtilities.attachEmitterToSingle(currentBolt,
_r1Emitter);
}
if (_r2Emitter != null) {
currentBolt = MyUtilities.attachEmitterToSingle(currentBolt,
_r2Emitter);
}
// connecting with Killer
// if (getHierarchyPosition() == FINAL_COMPONENT &&
// (!MyUtilities.isAckEveryTuple(conf)))
killer.registerComponent(this, componentName, parallelism);
}
@Override
public void execute(Tuple stormTupleRcv) {
final String inputComponentIndex = stormTupleRcv
.getStringByField(StormComponent.COMP_INDEX); // getString(0);
final List<String> tuple = (List<String>) stormTupleRcv
.getValueByField(StormComponent.TUPLE); // getValue(1);
if (processFinalAck(tuple, stormTupleRcv))
return;
processNonLastTuple(inputComponentIndex, tuple, stormTupleRcv, true);
_collector.ack(stormTupleRcv);
}
// from IRichBolt
@Override
public Map<String, Object> getComponentConfiguration() {
return _conf;
}
private void processNonLastTuple(String inputComponentIndex,
List<String> tuple, Tuple stormTupleRcv, boolean isLastInBatch) {
if (inputComponentIndex.equals(_r1EmitterIndex)) {
String key = tuple.get(0); // key is the only thing sent
_joinKeys1.add((JAT) _wrapper.fromString(key));
} else if (inputComponentIndex.equals(_r2EmitterIndex)) {
String key = tuple.get(0); // key is the only thing sent
_joinKeys2.add((JAT) _wrapper.fromString(key));
} else {
throw new RuntimeException("Unrecognized data source!");
}
}
@Override
public void declareOutputFields(OutputFieldsDeclarer declarer) {
// if (_hierarchyPosition == FINAL_COMPONENT) { // then its an
// intermediate
// stage not the final
// one
// if (!MyUtilities.isAckEveryTuple(_conf))
declarer.declareStream(SystemParameters.EOF_STREAM, new Fields(
SystemParameters.EOF));
}
// BaseRichSpout
@Override
public void prepare(Map map, TopologyContext tc, OutputCollector collector) {
// create list of non-null emitters
List<StormEmitter> emitters = new ArrayList<StormEmitter>();
if (_r1Emitter != null) {
emitters.add(_r1Emitter);
}
if (_r2Emitter != null) {
emitters.add(_r2Emitter);
}
_collector = collector;
_numRemainingParents = MyUtilities.getNumParentTasks(tc, emitters);
}
// if true, we should exit from method which called this method
protected boolean processFinalAck(List<String> tuple, Tuple stormTupleRcv) {
if (MyUtilities.isFinalAck(tuple, _conf)) {
_numRemainingParents--;
if (_numRemainingParents == 0) {
finalizeProcessing();
}
MyUtilities.processFinalAck(_numRemainingParents,
StormComponent.FINAL_COMPONENT, _conf, stormTupleRcv,
_collector);
return true;
}
return false;
}
private void finalizeProcessing() {
if (_r1Emitter != null) {
createHistogram(_joinKeys1, HistogramType.S1_RES_HIST.filePrefix());
}
if (_r2Emitter != null) {
createHistogram(_joinKeys2, HistogramType.D2_COMB_HIST.filePrefix());
}
}
private void createHistogram(List<JAT> joinKeys, String filePrefix) {
LOG.info("Before sorting keys");
// joinKeys.sort();
Collections.sort(joinKeys);
LOG.info("After sorting keys");
LOG.info("Keys size is " + joinKeys.size());
// create bucket boundaries
// choose keys equi-distantly such that in total there are
// _numOfLastJoiners of them
List<JAT> boundaries = createBoundaries(joinKeys, _numOfLastJoiners);
int size = boundaries.size();
LOG.info("Boundaries size is " + size);
if (size != _numOfLastJoiners - 1) {
throw new RuntimeException(
"Developer error! RangeMulticastStreamGrouping expects _numOfLastJoiners - 1 bondaries!");
}
String histogramFilename = MyUtilities.getHistogramFilename(_conf,
_numOfLastJoiners, filePrefix);
LOG.info("HistogramFilename " + filePrefix + " = " + histogramFilename);
// write Histogram
// "Most impressive is that the entire process is JVM independent,
// meaning an object can be serialized on one platform and deserialized
// on an entirely different platform."
DeepCopy.serializeToFile(boundaries, histogramFilename);
boundaries = (List) DeepCopy.deserializeFromFile(histogramFilename);
LOG.info("Boundaries are " + boundaries);
}
private List<JAT> createBoundaries(List<JAT> joinKeys, int numOfBuckets) {
if (numOfBuckets > joinKeys.size()) {
throw new RuntimeException(
"numOfJoiners at the last component has to be bigger than the number of join keys");
}
double distance = ((double) joinKeys.size()) / numOfBuckets;
if (distance < 1) {
throw new RuntimeException(
"A bug: same element cannot be included more than once!");
}
// ListAdapter<JAT> boundaries = MyUtilities.createListAdapter(_conf);
List<JAT> boundaries = new ArrayList<JAT>();
for (int i = 0; i < numOfBuckets; i++) {
// We want to avoid high discrepancy between bucket sizes, which is
// a consequence of input sample size != 100 * n_s
int index = (int) (i * distance + 0.5);
boundaries.add(joinKeys.get(index));
}
// RangeMulticastStreamGrouping does not need the start of the first
// bucket (it is -infinity anyway)
boundaries.remove(0);
return boundaries;
}
}