/**
* Copyright 2011 LiveRamp
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.liveramp.hank.cascading;
import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Filter;
import cascading.operation.FilterCall;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.pipe.Each;
import cascading.pipe.GroupBy;
import cascading.pipe.Merge;
import cascading.pipe.Pipe;
import cascading.pipe.SubAssembly;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import com.liveramp.hank.coordinator.Coordinator;
import com.liveramp.hank.coordinator.Domain;
import com.liveramp.hank.coordinator.RunWithCoordinator;
import com.liveramp.hank.coordinator.RunnableWithCoordinator;
import com.liveramp.hank.hadoop.DomainBuilderProperties;
import com.liveramp.hank.hadoop.PartitionIntWritable;
import com.liveramp.hank.partitioner.Partitioner;
import com.liveramp.hank.storage.StorageEngine;
/**
 * Cascading sub-assembly that decorates a pipe of key/value tuples with the
 * partition and comparable key of each key for a given Hank domain, and
 * combines it with a "partition markers" pipe into a single tail.
 *
 * <p>The tail is either a {@link GroupBy} on {@link #PARTITION_FIELD_NAME}
 * with a secondary sort on {@link #COMPARABLE_KEY_FIELD_NAME}, or a plain
 * {@link Merge} when the input is declared to be already partitioned and
 * sorted.
 */
public class DomainBuilderAssembly extends SubAssembly {

  private static final long serialVersionUID = 1L;

  /** Name of the field that receives the partition computed for each key. */
  public static final String PARTITION_FIELD_NAME = "__hank_partition";

  /** Name of the field that receives the storage engine's comparable key. */
  public static final String COMPARABLE_KEY_FIELD_NAME = "__hank_comparable_key";

  private static final String PARTITION_MARKERS_PIPE_NAME_PREFIX = "__hank_partition_markers_for_";
  private static final String SINK_NAME_PREFIX = "__hank_sink_for_";

  /**
   * Builds an assembly that partitions and sorts its input and keeps every
   * partition. Equivalent to calling the full constructor with
   * {@code shouldPartitionAndSortInput = true} and
   * {@code partitionToBuild = null}.
   *
   * @param domainName     name of the Hank domain being built
   * @param outputPipe     pipe carrying the key and value fields
   * @param keyFieldName   name of the field holding the key
   * @param valueFieldName name of the field holding the value
   */
  public DomainBuilderAssembly(String domainName,
                               Pipe outputPipe,
                               String keyFieldName,
                               String valueFieldName) {
    this(domainName, outputPipe, keyFieldName, valueFieldName, true, null);
  }

  /**
   * Builds the assembly.
   *
   * @param domainName                  name of the Hank domain being built; also used to derive
   *                                    the partition markers pipe name and the sink name
   * @param outputPipe                  pipe carrying the key and value fields
   * @param keyFieldName                name of the field holding the key; its value is read as a
   *                                    {@link BytesWritable}
   * @param valueFieldName              name of the field holding the value
   * @param shouldPartitionAndSortInput when {@code true}, the tail groups by partition and sorts
   *                                    by comparable key; when {@code false}, the pipes are only
   *                                    merged and the input is assumed to be already partitioned
   *                                    and sorted
   * @param partitionToBuild            when non-null, only tuples whose partition equals this
   *                                    value are kept (on both the data and the markers pipe)
   */
  public DomainBuilderAssembly(String domainName,
                               Pipe outputPipe,
                               String keyFieldName,
                               String valueFieldName,
                               boolean shouldPartitionAndSortInput,
                               Integer partitionToBuild) {
    super(outputPipe);

    // Head pipe with a well-known name derived from the domain name.
    // NOTE(review): presumably the caller binds a source of partition marker tuples to it.
    Pipe partitionMarkersPipe = new Pipe(getPartitionMarkersPipeName(domainName));

    // Add partition and comparable key fields, keeping only the four fields needed downstream
    Pipe recordsPipe = new Each(outputPipe,
        new Fields(keyFieldName),
        new AddPartitionAndComparableKeyFields(domainName, PARTITION_FIELD_NAME, COMPARABLE_KEY_FIELD_NAME),
        new Fields(keyFieldName, valueFieldName, PARTITION_FIELD_NAME, COMPARABLE_KEY_FIELD_NAME));

    // Filter partitions if necessary
    if (partitionToBuild != null) {
      recordsPipe = new Each(recordsPipe,
          new Fields(PARTITION_FIELD_NAME),
          new KeepPartitions(partitionToBuild));
      partitionMarkersPipe = new Each(partitionMarkersPipe,
          new Fields(PARTITION_FIELD_NAME),
          new KeepPartitions(partitionToBuild));
    }

    Pipe tail;
    if (shouldPartitionAndSortInput) {
      // Group by partition id and secondary sort on comparable key
      tail = new GroupBy(getSinkName(domainName),
          new Pipe[]{partitionMarkersPipe, recordsPipe},
          new Fields(PARTITION_FIELD_NAME),
          new Fields(COMPARABLE_KEY_FIELD_NAME));
    } else {
      // The input is considered to be already partitioned and sorted
      tail = new Merge(recordsPipe, partitionMarkersPipe);
    }
    setTails(tail);
  }

  /**
   * Filter that removes every tuple whose partition (first argument, read as
   * an {@link IntWritable}) differs from the single partition to keep.
   */
  private static class KeepPartitions extends BaseOperation implements Filter {

    private static final long serialVersionUID = 1L;

    private final int partitionToKeep;

    KeepPartitions(int partitionToKeep) {
      super(1);
      this.partitionToKeep = partitionToKeep;
    }

    @Override
    public boolean isRemove(FlowProcess flowProcess, FilterCall filterCall) {
      int partition = ((IntWritable) filterCall.getArguments().getObject(0)).get();
      return partition != partitionToKeep;
    }
  }

  /**
   * Function that takes a key (first argument, read as a {@link BytesWritable})
   * and emits two fields: the key's partition and the key's comparable form as
   * computed by the domain's partitioner and storage engine.
   *
   * <p>The domain's configuration is held in transient fields and is loaded on
   * the first call to {@link #operate}.
   */
  private static class AddPartitionAndComparableKeyFields extends BaseOperation<AddPartitionAndComparableKeyFields> implements Function<AddPartitionAndComparableKeyFields> {

    private static final long serialVersionUID = 1L;

    // Transient: not carried along with the serialized operation, reloaded lazily.
    transient private Integer domainNumParts;
    transient private StorageEngine storageEngine;
    transient private Partitioner partitioner;

    private final String domainName;

    AddPartitionAndComparableKeyFields(String domainName, String partitionFieldName, String comparableKeyFieldName) {
      super(1, new Fields(partitionFieldName, comparableKeyFieldName));
      this.domainName = domainName;
    }

    @Override
    public void operate(FlowProcess flowProcess, FunctionCall<AddPartitionAndComparableKeyFields> call) {
      // Load configuration lazily
      loadConfiguration(flowProcess);

      // Compute partition and comparable key
      TupleEntry tupleEntry = call.getArguments();
      BytesWritable key = (BytesWritable) tupleEntry.getObject(0);
      // Only the first getLength() bytes of the backing array are valid key bytes
      ByteBuffer keyByteBuffer = ByteBuffer.wrap(key.getBytes(), 0, key.getLength());
      PartitionIntWritable partition = new PartitionIntWritable(partitioner.partition(keyByteBuffer, domainNumParts));
      ByteBuffer comparableKey = storageEngine.getComparableKey(keyByteBuffer);

      // Copy the remaining bytes through a duplicate: this works for direct and
      // read-only buffers (where array() would throw) and leaves the position
      // of the buffer returned by the storage engine untouched.
      byte[] comparableKeyBytes = new byte[comparableKey.remaining()];
      comparableKey.duplicate().get(comparableKeyBytes);
      BytesWritable comparableKeyBytesWritable = new BytesWritable(comparableKeyBytes);

      // Add partition and comparable key fields
      call.getOutputCollector().add(new Tuple(partition, comparableKeyBytesWritable));
    }

    /**
     * Loads the domain's number of partitions, storage engine and partitioner
     * through a coordinator if any of them is not available yet.
     *
     * @throws RuntimeException wrapping the {@link IOException} if loading fails
     */
    private void loadConfiguration(FlowProcess flowProcess) {
      if (domainNumParts != null && storageEngine != null && partitioner != null) {
        return;
      }
      try {
        RunWithCoordinator.run(DomainBuilderProperties.getConfigurator(domainName, flowProcess),
            new RunnableWithCoordinator() {
              @Override
              public void run(Coordinator coordinator) throws IOException {
                Domain domain = DomainBuilderProperties.getDomain(coordinator, domainName);
                domainNumParts = domain.getNumParts();
                storageEngine = domain.getStorageEngine();
                partitioner = domain.getPartitioner();
              }
            });
      } catch (IOException e) {
        throw new RuntimeException("Failed to load configuration.", e);
      }
    }
  }

  /**
   * Returns the name of the partition markers pipe for the given domain.
   *
   * @param domainName name of the Hank domain
   * @return the pipe name, i.e. a fixed prefix followed by {@code domainName}
   */
  public static String getPartitionMarkersPipeName(String domainName) {
    return PARTITION_MARKERS_PIPE_NAME_PREFIX + domainName;
  }

  /**
   * Returns the name given to the grouping tail (and hence to its sink) for
   * the given domain.
   *
   * @param domainName name of the Hank domain
   * @return the sink name, i.e. a fixed prefix followed by {@code domainName}
   */
  public static String getSinkName(String domainName) {
    return SINK_NAME_PREFIX + domainName;
  }
}