/*
* Copyright (c) 2011-2015 EPFL DATA Laboratory
* Copyright (c) 2014-2015 The Squall Collaboration (see NOTICE)
*
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ch.epfl.data.squall.connectors.hdfs;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.lang.ArrayUtils;
import org.apache.storm.hdfs.bolt.HdfsBolt;
import org.apache.storm.hdfs.bolt.format.DefaultFileNameFormat;
import org.apache.storm.hdfs.bolt.format.DelimitedRecordFormat;
import org.apache.storm.hdfs.bolt.format.FileNameFormat;
import org.apache.storm.hdfs.bolt.format.RecordFormat;
import org.apache.storm.hdfs.bolt.rotation.FileRotationPolicy;
import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy;
import org.apache.storm.hdfs.bolt.rotation.FileSizeRotationPolicy.Units;
import org.apache.storm.hdfs.bolt.sync.CountSyncPolicy;
import org.apache.storm.hdfs.bolt.sync.SyncPolicy;
import backtype.storm.Config;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.topology.base.BaseRichBolt;
import ch.epfl.data.squall.components.Component;
import ch.epfl.data.squall.components.DataSourceComponent;
import ch.epfl.data.squall.expressions.ValueExpression;
import ch.epfl.data.squall.operators.ChainOperator;
import ch.epfl.data.squall.operators.Operator;
import ch.epfl.data.squall.predicates.Predicate;
import ch.epfl.data.squall.storm_components.StormComponent;
import ch.epfl.data.squall.storm_components.StormEmitter;
import ch.epfl.data.squall.storm_components.synchronization.TopologyKiller;
import ch.epfl.data.squall.types.Type;
import ch.epfl.data.squall.utilities.MyUtilities;
/**
 * A {@link Component} that attaches an {@link HdfsBolt} to its parent's
 * output so that the parent's tuples are written to files on HDFS.
 *
 * <p>
 * Records are written as delimited text under {@code /<folderName>/} on the
 * file system identified by the HDFS URL given at construction time.
 */
public class HDFSmaterializer implements Component {
    /** Field delimiter used for every record written to HDFS. */
    private static final String FIELD_DELIMITER = "|";
    /** Number of tuples after which the file system is synced. */
    private static final int SYNC_TUPLE_COUNT = 1000;
    /** File size, in megabytes, at which the output file is rotated. */
    private static final float ROTATION_SIZE_MB = 5.0f;
    /** Extension given to every output file. */
    private static final String FILE_EXTENSION = ".txt";
    /**
     * Id under which the HDFS bolt is registered in the topology.
     * NOTE(review): the id is a fixed string rather than the component name,
     * so two materializers in one topology would presumably collide - confirm
     * before using more than one.
     */
    private static final String BOLT_ID = "hdfs";

    private final String _componentName;
    private long _batchOutputMillis;
    private List<Integer> _hashIndexes;
    private List<ValueExpression> _hashExpressions;
    private final ChainOperator _chain = new ChainOperator();
    private boolean _printOut;
    private boolean _printOutSet;
    private final Component _parent;
    private Component _child;
    private List<String> _fullHashList;
    private String _hdfsPath;
    private String _folderName;
    private int _parallelism;

    /**
     * Creates the materializer and registers it as the child of
     * {@code parent}.
     *
     * @param parent
     *            component whose output is materialized; must not be null
     * @param componentName
     *            name of this component, used by {@link #getName()},
     *            {@link #equals(Object)} and {@link #hashCode()}
     * @param hdfsPath
     *            file system URL handed to the HDFS bolt
     * @param folderName
     *            folder (below the file system root) receiving the files
     * @param parallelism
     *            parallelism hint used when the bolt is added to the topology
     * @throws NullPointerException
     *             if {@code parent} is null
     */
    public HDFSmaterializer(Component parent, String componentName,
            String hdfsPath, String folderName, int parallelism) {
        if (parent == null)
            throw new NullPointerException("parent");
        _parent = parent;
        _componentName = componentName;
        _hdfsPath = hdfsPath;
        _folderName = folderName;
        _parallelism = parallelism;
        // Register with the parent last, so that it never observes a
        // partially initialised child.
        _parent.setChild(this);
    }

    /**
     * Builds the bolt that writes incoming tuples to HDFS.
     *
     * @param hdfsPath
     *            file system URL the bolt writes to
     * @return a configured {@link HdfsBolt}
     */
    public BaseRichBolt createHDFSmaterializer(String hdfsPath) {
        final RecordFormat recordFormat = new DelimitedRecordFormat()
                .withFieldDelimiter(FIELD_DELIMITER);
        // sync the filesystem after every SYNC_TUPLE_COUNT tuples
        final SyncPolicy syncPolicy = new CountSyncPolicy(SYNC_TUPLE_COUNT);
        // rotate files when they reach ROTATION_SIZE_MB megabytes
        final FileRotationPolicy rotationPolicy = new FileSizeRotationPolicy(
                ROTATION_SIZE_MB, Units.MB);
        final FileNameFormat fileNameFormat = new DefaultFileNameFormat()
                .withPath("/" + _folderName + "/")
                .withExtension(FILE_EXTENSION);
        return new HdfsBolt().withFsUrl(hdfsPath)
                .withFileNameFormat(fileNameFormat)
                .withRecordFormat(recordFormat)
                .withRotationPolicy(rotationPolicy)
                .withSyncPolicy(syncPolicy);
    }

    /**
     * Appends {@code operator} to this component's operator chain.
     *
     * @return this component, for call chaining
     */
    @Override
    public HDFSmaterializer add(Operator operator) {
        _chain.addOperator(operator);
        return this;
    }

    /**
     * Two components are equal when they carry the same name. The comparison
     * is null-safe, matching {@link #hashCode()}, which also accepts a null
     * name.
     */
    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof Component))
            return false;
        final String otherName = ((Component) obj).getName();
        return _componentName == null ? otherName == null : _componentName
                .equals(otherName);
    }

    /**
     * @return a new list holding the data sources reported by the parent
     */
    @Override
    public List<DataSourceComponent> getAncestorDataSources() {
        return new ArrayList<DataSourceComponent>(
                _parent.getAncestorDataSources());
    }

    /** @return the value last passed to {@link #setBatchOutputMillis(long)} */
    @Override
    public long getBatchOutputMillis() {
        return _batchOutputMillis;
    }

    /** @return the operator chain populated through {@link #add(Operator)} */
    @Override
    public ChainOperator getChainOperator() {
        return _chain;
    }

    /** @return the child set through {@link #setChild(Component)}, or null */
    @Override
    public Component getChild() {
        return _child;
    }

    /**
     * Not supported by this component.
     *
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public String[] getEmitterIDs() {
        throw new UnsupportedOperationException("Not implemented yet");
    }

    /** @return the list set through {@link #setFullHashList(List)}, or null */
    @Override
    public List<String> getFullHashList() {
        return _fullHashList;
    }

    /**
     * @return the expressions set through {@link #setHashExpressions(List)},
     *         or null
     */
    @Override
    public List<ValueExpression> getHashExpressions() {
        return _hashExpressions;
    }

    /**
     * @return the indexes set through {@link #setOutputPartKey(List)}, or
     *         null
     */
    @Override
    public List<Integer> getHashIndexes() {
        return _hashIndexes;
    }

    /**
     * Not supported by this component.
     *
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public String getInfoID() {
        throw new UnsupportedOperationException("Not implemented yet");
    }

    /** @return the component name given at construction time */
    @Override
    public String getName() {
        return _componentName;
    }

    /** @return a one-element array holding the parent component */
    @Override
    public Component[] getParents() {
        return new Component[] { _parent };
    }

    /** @return the current print-out flag */
    @Override
    public boolean getPrintOut() {
        return _printOut;
    }

    /** Hash code derived from the component name only; a null name hashes as 0. */
    @Override
    public int hashCode() {
        final int nameHash = _componentName != null ? _componentName
                .hashCode() : 0;
        return 47 * 5 + nameHash;
    }

    /**
     * Adds the HDFS bolt to {@code builder}, shuffle-grouped on the first
     * emitter id of the parent component.
     *
     * @param builder
     *            topology under construction
     * @param killer
     *            not used by this component
     * @param allCompNames
     *            not used by this component
     * @param conf
     *            configuration passed to the batch-output check
     * @param hierarchyPosition
     *            position of this component in the query plan
     */
    @Override
    public void makeBolts(TopologyBuilder builder, TopologyKiller killer,
            List<String> allCompNames, Config conf, int hierarchyPosition) {
        // by default print out for the last component
        // for other conditions, can be set via setPrintOut
        final boolean isFinalComponent = hierarchyPosition == StormComponent.FINAL_COMPONENT;
        if (isFinalComponent && !_printOutSet)
            setPrintOut(true);

        MyUtilities.checkBatchOutput(_batchOutputMillis,
                _chain.getAggregation(), conf);

        final BaseRichBolt hdfsBolt = createHDFSmaterializer(_hdfsPath);
        // Component already declares getEmitterIDs(), so no cast is needed.
        final String parentEmitterId = _parent.getEmitterIDs()[0];
        builder.setBolt(BOLT_ID, hdfsBolt, _parallelism).shuffleGrouping(
                parentEmitterId);
    }

    /**
     * Stores the batch output interval.
     *
     * @return this component, for call chaining
     */
    @Override
    public HDFSmaterializer setBatchOutputMillis(long millis) {
        _batchOutputMillis = millis;
        return this;
    }

    /** Stores {@code child} as this component's child. */
    @Override
    public void setChild(Component child) {
        _child = child;
    }

    /**
     * Ignored by this component; the wrapper is not stored.
     *
     * @return this component, for call chaining
     */
    @Override
    public Component setContentSensitiveThetaJoinWrapper(Type wrapper) {
        return this;
    }

    /**
     * Stores the full hash list.
     *
     * @return this component, for call chaining
     */
    @Override
    public HDFSmaterializer setFullHashList(List<String> fullHashList) {
        _fullHashList = fullHashList;
        return this;
    }

    /**
     * Stores the hash expressions.
     *
     * @return this component, for call chaining
     */
    @Override
    public HDFSmaterializer setHashExpressions(
            List<ValueExpression> hashExpressions) {
        _hashExpressions = hashExpressions;
        return this;
    }

    /**
     * Varargs convenience overload of {@link #setOutputPartKey(List)}.
     *
     * @return this component, for call chaining
     */
    @Override
    public HDFSmaterializer setOutputPartKey(int... hashIndexes) {
        final Integer[] boxedIndexes = ArrayUtils.toObject(hashIndexes);
        return setOutputPartKey(Arrays.asList(boxedIndexes));
    }

    /**
     * Stores the hash indexes.
     *
     * @return this component, for call chaining
     */
    @Override
    public HDFSmaterializer setOutputPartKey(List<Integer> hashIndexes) {
        _hashIndexes = hashIndexes;
        return this;
    }

    /**
     * Sets the print-out flag and records that it was set explicitly, which
     * stops {@code makeBolts} from applying its final-component default.
     *
     * @return this component, for call chaining
     */
    @Override
    public HDFSmaterializer setPrintOut(boolean printOut) {
        _printOutSet = true;
        _printOut = printOut;
        return this;
    }
}