/*
 * Copyright (c) 2011-2015 EPFL DATA Laboratory
 * Copyright (c) 2014-2015 The Squall Collaboration (see NOTICE)
 *
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package ch.epfl.data.squall.storm_components.hyper_cube;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;

import backtype.storm.Config;
import backtype.storm.topology.InputDeclarer;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.tuple.Tuple;

import ch.epfl.data.squall.components.ComponentProperties;
import ch.epfl.data.squall.predicates.Predicate;
import ch.epfl.data.squall.storage.TupleStorage;
import ch.epfl.data.squall.storm_components.StormComponent;
import ch.epfl.data.squall.storm_components.StormEmitter;
import ch.epfl.data.squall.storm_components.StormJoinerBoltComponent;
import ch.epfl.data.squall.storm_components.hash_hypercube.HashHyperCubeGrouping.EmitterDesc;
import ch.epfl.data.squall.storm_components.synchronization.TopologyKiller;
import ch.epfl.data.squall.thetajoin.matrix_assignment.HashHyperCubeAssignment;
import ch.epfl.data.squall.thetajoin.matrix_assignment.HashHyperCubeAssignmentBruteForce;
import ch.epfl.data.squall.thetajoin.matrix_assignment.HashHyperCubeAssignmentBruteForce.ColumnDesc;
import ch.epfl.data.squall.thetajoin.matrix_assignment.HybridHyperCubeAssignment;
import ch.epfl.data.squall.thetajoin.matrix_assignment.HybridHyperCubeAssignmentBruteForce;
import ch.epfl.data.squall.thetajoin.matrix_assignment.HyperCubeAssignerFactory;
import ch.epfl.data.squall.thetajoin.matrix_assignment.HyperCubeAssignment;
import ch.epfl.data.squall.types.Type;
import ch.epfl.data.squall.utilities.MyUtilities;
import ch.epfl.data.squall.utilities.PartitioningScheme;
import ch.epfl.data.squall.utilities.PeriodicAggBatchSend;
import ch.epfl.data.squall.utilities.SystemParameters;
import ch.epfl.data.squall.utilities.statistics.StatisticsUtilities;
import ch.epfl.data.squall.window_semantics.WindowSemanticsManager;
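/**
 * A traditional two-way join bolt: both input relations are materialized in
 * {@link TupleStorage}, and indexes are built over them when a join predicate
 * is supplied. How input tuples are routed to the parallel instances is
 * configurable (hypercube, hash-hypercube, or brute-force hybrid-hypercube
 * partitioning); see {@link #attachEmitters}.
 */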
public class TradionalTwoWayJoin extends StormJoinerBoltComponent {
    private static final long serialVersionUID = 1L;
    private static final Logger LOG = Logger
            .getLogger(TradionalTwoWayJoin.class);

    // In-memory state: one store per input relation.
    private final TupleStorage _firstRelationStorage, _secondRelationStorage;

    private List<StormEmitter> _emitters;
    // Per-emitter schema information, keyed by emitter name.
    private Map<String, String[]> _emitterColNames;
    private Map<String, Type[]> _emitterNamesColTypes;
    // Columns to be partitioned randomly rather than by hash.
    private Set<String> _randomColumns;

    public TradionalTwoWayJoin(StormEmitter firstEmitter,
            StormEmitter secondEmitter, Map<String, String[]> emitterColNames,
            Map<String, Type[]> emitterNamesColTypes,
            Set<String> randomColumns, ComponentProperties cp,
            List<String> allCompNames, Predicate joinPredicate,
            boolean isPartitioner, int hierarchyPosition,
            TopologyBuilder builder, TopologyKiller killer, Config conf,
            boolean isContentSensitive, Type wrapper) {
        super(firstEmitter, secondEmitter, cp, allCompNames, joinPredicate,
                hierarchyPosition, builder, killer, isPartitioner, conf);
        _statsUtils = new StatisticsUtilities(getConf(), LOG);

        // The component's parallelism is read from the "<ID>_PAR" config entry.
        final int parallelism = SystemParameters.getInt(conf, getID() + "_PAR");
        InputDeclarer currentBolt = builder.setBolt(getID(), this, parallelism);

        _emitterColNames = emitterColNames;
        _emitterNamesColTypes = emitterNamesColTypes;
        _randomColumns = randomColumns;
        _emitters = new ArrayList<StormEmitter>();
        _emitters.add(firstEmitter);
        _emitters.add(secondEmitter);

        currentBolt = attachEmitters(conf, currentBolt, allCompNames,
                parallelism);

        if (getHierarchyPosition() == FINAL_COMPONENT
                && !MyUtilities.isAckEveryTuple(conf))
            killer.registerComponent(this, parallelism);
        if (cp.getPrintOut() && _operatorChain.isBlocking())
            currentBolt.allGrouping(killer.getID(),
                    SystemParameters.DUMP_RESULTS_STREAM);

        _firstRelationStorage = new TupleStorage();
        _secondRelationStorage = new TupleStorage();
        if (_joinPredicate != null) {
            createIndexes();
            _existIndexes = true;
        } else
            _existIndexes = false;
    }
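    /**
     * Subscribes this bolt to both emitters under the partitioning scheme
     * selected via {@link #getPartitioningScheme}. Every scheme first reads
     * the emitter cardinalities from the config (see
     * {@link #getEmittersCardinality}): HYPERCUBE sizes the cube from the
     * cardinalities alone, while HASHHYPERCUBE and BRUTEFORCEHYBRIDHYPERCUBE
     * additionally partition on the per-column descriptors built by
     * {@link #getColumnDesc}.
     */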
    private InputDeclarer attachEmitters(Config conf,
            InputDeclarer currentBolt, List<String> allCompNames,
            int parallelism) {
        switch (getPartitioningScheme(conf)) {
        case BRUTEFORCEHYBRIDHYPERCUBE: {
            final long[] cardinality = getEmittersCardinality(_emitters, conf);
            LOG.info("cardinalities: " + Arrays.toString(cardinality));
            final List<ColumnDesc> columns = getColumnDesc(cardinality,
                    _emitters);
            final List<EmitterDesc> emittersDesc = MyUtilities.getEmitterDesc(
                    _emitters, _emitterColNames, allCompNames, cardinality);
            final HybridHyperCubeAssignment assignment = new HybridHyperCubeAssignmentBruteForce(
                    emittersDesc, columns, _randomColumns, parallelism);
            LOG.info("assignment: " + assignment.getMappingDimensions());
            currentBolt = MyUtilities.attachEmitterHybridHyperCube(currentBolt,
                    _emitters, _emitterColNames, allCompNames, assignment,
                    emittersDesc, conf);
            break;
        }
        case HASHHYPERCUBE: {
            final long[] cardinality = getEmittersCardinality(_emitters, conf);
            LOG.info("cardinalities: " + Arrays.toString(cardinality));
            final List<ColumnDesc> columns = getColumnDesc(cardinality,
                    _emitters);
            final List<EmitterDesc> emittersDesc = MyUtilities.getEmitterDesc(
                    _emitters, _emitterColNames, allCompNames, cardinality);
            final HashHyperCubeAssignment assignment = new HashHyperCubeAssignmentBruteForce(
                    parallelism, columns, emittersDesc);
            LOG.info("assignment: " + assignment.getMappingDimensions());
            currentBolt = MyUtilities.attachEmitterHashHyperCube(currentBolt,
                    _emitters, _emitterColNames, assignment, emittersDesc,
                    conf);
            break;
        }
        case HYPERCUBE: {
            final long[] cardinality = getEmittersCardinality(_emitters, conf);
            LOG.info("cardinalities: " + Arrays.toString(cardinality));
            final HyperCubeAssignment assignment = new HyperCubeAssignerFactory()
                    .getAssigner(parallelism, cardinality);
            LOG.info("assignment: " + assignment.getMappingDimensions());
            currentBolt = MyUtilities.attachEmitterHyperCube(currentBolt,
                    _emitters, allCompNames, assignment, conf);
            break;
        }
        }
        return currentBolt;
    }

    // Builds one ColumnDesc per distinct column name across all emitters; a
    // column that appears in several relations accumulates their
    // cardinalities into its size estimate.
    private List<ColumnDesc> getColumnDesc(long[] cardinality,
            List<StormEmitter> emitters) {
        final Map<String, ColumnDesc> byName = new HashMap<String, ColumnDesc>();
        for (int i = 0; i < emitters.size(); i++) {
            final String emitterName = emitters.get(i).getName();
            final Type[] columnTypes = _emitterNamesColTypes.get(emitterName);
            final String[] columnNames = _emitterColNames.get(emitterName);
            for (int j = 0; j < columnNames.length; j++) {
                final ColumnDesc cd = new ColumnDesc(columnNames[j],
                        columnTypes[j], cardinality[i]);
                if (byName.containsKey(cd.name))
                    cd.size += byName.get(cd.name).size;
                byName.put(cd.name, cd);
            }
        }
        return new ArrayList<ColumnDesc>(byName.values());
    }

    // The scheme is read from the "<name>_PART_SCHEME" config entry and
    // defaults to HYPERCUBE when the entry is absent or empty.
    private PartitioningScheme getPartitioningScheme(Config conf) {
        final String schemeName = SystemParameters.getString(conf, getName()
                + "_PART_SCHEME");
        if (schemeName == null || schemeName.equals("")) {
            LOG.info("use default HYPERCUBE partitioning scheme");
            return PartitioningScheme.HYPERCUBE;
        } else {
            LOG.info("use partitioning scheme: " + schemeName);
            return PartitioningScheme.valueOf(schemeName);
        }
    }

    // Emitter cardinalities are read from "<emitterName>_CARD" config entries.
    private long[] getEmittersCardinality(List<StormEmitter> emitters,
            Config conf) {
        final long[] cardinality = new long[emitters.size()];
        for (int i = 0; i < emitters.size(); i++)
            cardinality[i] = SystemParameters.getInt(conf, emitters.get(i)
                    .getName() + "_CARD");
        return cardinality;
    }
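    /**
     * Processes one incoming Storm tuple. In the default mode a Storm tuple
     * carries a single relational tuple; in manual-batching mode it carries a
     * delimiter-separated batch that is split and processed tuple by tuple.
     * LAST_ACK markers are intercepted by {@link #processFinalAck} in both
     * modes.
     */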
    @Override
    public void execute(Tuple stormTupleRcv) {
        // TODO: short-circuit if this is a window configuration
        if (WindowSemanticsManager.evictStateIfSlidingWindowSemantics(this,
                stormTupleRcv))
            return;

        if (_firstTime
                && MyUtilities.isAggBatchOutputMode(_aggBatchOutputMillis)) {
            _periodicAggBatch = new PeriodicAggBatchSend(_aggBatchOutputMillis,
                    this);
            _firstTime = false;
        }
        if (receivedDumpSignal(stormTupleRcv)) {
            MyUtilities.dumpSignal(this, stormTupleRcv, getCollector());
            return;
        }

        if (!MyUtilities.isManualBatchingMode(getConf())) {
            // One relational tuple per Storm tuple.
            final String inputComponentIndex = stormTupleRcv
                    .getStringByField(StormComponent.COMP_INDEX);
            final List<String> tuple = (List<String>) stormTupleRcv
                    .getValueByField(StormComponent.TUPLE);
            final String inputTupleHash = stormTupleRcv
                    .getStringByField(StormComponent.HASH);
            if (processFinalAck(tuple, stormTupleRcv))
                return;
            processNonLastTuple(inputComponentIndex, tuple, inputTupleHash,
                    stormTupleRcv, true, _firstRelationStorage,
                    _secondRelationStorage);
        } else {
            // A delimited batch of relational tuples per Storm tuple.
            final String inputComponentIndex = stormTupleRcv
                    .getStringByField(StormComponent.COMP_INDEX);
            final String inputBatch = stormTupleRcv
                    .getStringByField(StormComponent.TUPLE);
            final String[] wholeTuples = inputBatch
                    .split(SystemParameters.MANUAL_BATCH_TUPLE_DELIMITER);
            final int batchSize = wholeTuples.length;
            for (int i = 0; i < batchSize; i++) {
                // Each entry is either "hash<DELIM>tuple" or, for the final
                // ack, the bare tuple.
                final String[] parts = wholeTuples[i]
                        .split(SystemParameters.MANUAL_BATCH_HASH_DELIMITER);
                String inputTupleHash = null;
                final String inputTupleString;
                if (parts.length == 1) // lastAck
                    inputTupleString = parts[0];
                else {
                    inputTupleHash = parts[0];
                    inputTupleString = parts[1];
                }
                final List<String> tuple = MyUtilities.stringToTuple(
                        inputTupleString, getConf());
                // final-ack check
                if (processFinalAck(tuple, stormTupleRcv)) {
                    if (i != batchSize - 1)
                        throw new RuntimeException(
                                "Should not be here. LAST_ACK is not the last tuple!");
                    return;
                }
                // Only the last tuple of the batch is flagged as such.
                processNonLastTuple(inputComponentIndex, tuple, inputTupleHash,
                        stormTupleRcv, i == batchSize - 1,
                        _firstRelationStorage, _secondRelationStorage);
            }
        }
        // TODO: update the latest timestamp for window semantics
        WindowSemanticsManager.updateLatestTimeStamp(this, stormTupleRcv);
        getCollector().ack(stormTupleRcv);
    }

    @Override
    protected void printStatistics(int type) {
        printStatistics(type, _firstRelationStorage.size(),
                _secondRelationStorage.size(), LOG);
    }

    // TODO window semantics
    @Override
    public void purgeStaleStateFromWindow() {
        // Drop tuples older than the latest seen timestamp minus the GC tick.
        _firstRelationStorage.purgeState(_latestTimeStamp
                - WindowSemanticsManager._GC_PERIODIC_TICK,
                _firstRelationIndexes, _joinPredicate, getConf(), true);
        _secondRelationStorage.purgeState(_latestTimeStamp
                - WindowSemanticsManager._GC_PERIODIC_TICK,
                _secondRelationIndexes, _joinPredicate, getConf(), false);
        System.gc();
    }
}
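/*
 * Illustrative wiring sketch only; the names below (customers, orders,
 * colNames, colTypes, predicate, ...) are placeholders, not part of this
 * codebase. A query plan with two source components would create the joiner
 * roughly along these lines:
 *
 *   StormEmitter customers = ..., orders = ...;
 *   TradionalTwoWayJoin joiner = new TradionalTwoWayJoin(
 *       customers, orders,
 *       colNames, colTypes, randomColumns,  // per-emitter schema maps
 *       cp, allCompNames, predicate,
 *       false, hierarchyPosition, builder, killer, conf, false, wrapper);
 *
 * The config must supply "<ID>_PAR" for this component's parallelism and
 * "<emitterName>_CARD" for each input's cardinality estimate.
 */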