package com.sequenceiq.cascading;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.flow.FlowDef;
import cascading.flow.FlowRuntimeProps;
import cascading.flow.tez.Hadoop2TezFlowConnector;
import cascading.operation.aggregator.Count;
import cascading.operation.buffer.FirstNBuffer;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.scheme.Scheme;
import cascading.scheme.hadoop.TextDelimited;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import java.util.Properties;
/**
 * Command-line driver that builds and runs a Cascading flow on Tez which
 * counts the records per {@code userId} in a comma-delimited input and writes
 * the first {@code top} users, ordered by descending count, to the output path.
 *
 * <p>Usage: {@code Main <inputPath> <outputPath> <top>}
 */
public class Main {

    /** Usage text printed when the command-line arguments are invalid. */
    private static final String USAGE = "Usage: Main <inputPath> <outputPath> <top>";

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output
     *             path and {@code args[2]} the number of top users to keep
     *             (must parse as an integer); any further arguments are ignored
     */
    public static void main( String[] args ) {
        // Validate the arguments up front so a bad invocation produces a usage
        // message instead of an ArrayIndexOutOfBoundsException or a raw
        // NumberFormatException stack trace.
        if (args.length < 3) {
            exitWithUsage("expected 3 arguments but got " + args.length);
            return;
        }
        final String inputPath = args[0];
        final String outputPath = args[1];
        final int top;
        try {
            top = Integer.parseInt(args[2]);
        } catch (NumberFormatException e) {
            exitWithUsage("<top> must be an integer, got '" + args[2] + "'");
            return;
        }

        // Application properties: register this class's jar, then layer the
        // flow runtime setting (gather partitions = 1) on top of them.
        Properties properties = AppProps.appProps()
                .setJarClass(Main.class)
                .buildProperties();
        properties = FlowRuntimeProps.flowRuntimeProps()
                .setGatherPartitions(1)
                .buildProperties(properties);
        FlowConnector flowConnector = new Hadoop2TezFlowConnector(properties);

        // Source: comma-delimited text with four declared columns.
        // NOTE(review): the two booleans are presumably skipHeader=false and
        // writeHeader=true of the four-argument TextDelimited constructor - confirm.
        final Fields fields = new Fields("userId", "data1", "data2", "data3");
        final Scheme scheme = new TextDelimited(fields, false, true, ",");
        final Pipe inPipe = new Pipe("inPipe");
        final Tap inTap = new Hfs(scheme, inputPath);

        // Stage 1: group by userId and count the tuples in each group.
        // The next stage refers to the count column by the name "count".
        final Fields groupFields = new Fields("userId");
        Pipe topUsersPipe = new GroupBy("topUsers", inPipe, groupFields);
        topUsersPipe = new Every(topUsersPipe, groupFields, new Count(), Fields.ALL);

        // Stage 2: regroup on Fields.NONE, sorting on (count, userId) with the
        // reverse-order flag set, then keep only the first `top` tuples.
        topUsersPipe = new GroupBy(topUsersPipe, Fields.NONE, new Fields("count", "userId"), true);
        topUsersPipe = new Every(topUsersPipe, Fields.ALL, new FirstNBuffer(top), Fields.ARGS);

        // Sink: comma-delimited text declaring the userId and count columns.
        final Fields resultFields = new Fields("userId", "count");
        final Scheme outputScheme = new TextDelimited(resultFields, false, true, ",");
        Tap sinkTap = new Hfs(outputScheme, outputPath);

        FlowDef flowDef = FlowDef.flowDef()
                .setName("TopK-TEZ")
                .addSource(inPipe, inTap)
                .addTailSink(topUsersPipe, sinkTap);
        Flow flow = flowConnector.connect(flowDef);
        flow.complete();

        // Explicit exit kept from the original implementation.
        // NOTE(review): presumably forces JVM shutdown in case client threads
        // are still alive after the flow finishes - confirm before removing.
        System.exit(0);
    }

    /**
     * Prints the given problem and the usage line to standard error, then
     * terminates the JVM with exit status 1.
     *
     * @param problem short description of what is wrong with the arguments
     */
    private static void exitWithUsage(String problem) {
        System.err.println("Error: " + problem);
        System.err.println(USAGE);
        System.exit(1);
    }
}