package com.sequenceiq.cascading; import cascading.flow.Flow; import cascading.flow.FlowConnector; import cascading.flow.FlowDef; import cascading.flow.FlowRuntimeProps; import cascading.flow.tez.Hadoop2TezFlowConnector; import cascading.operation.Filter; import cascading.operation.aggregator.Count; import cascading.operation.buffer.FirstNBuffer; import cascading.operation.expression.ExpressionFilter; import cascading.operation.regex.RegexFilter; import cascading.pipe.Each; import cascading.pipe.Every; import cascading.pipe.GroupBy; import cascading.pipe.Pipe; import cascading.property.AppProps; import cascading.scheme.Scheme; import cascading.scheme.hadoop.TextDelimited; import cascading.tap.Tap; import cascading.tap.hadoop.Hfs; import cascading.tuple.Fields; import java.util.Properties; public class Main { public static void main( String[] args ) { Properties properties = AppProps.appProps() .setJarClass(Main.class) .buildProperties(); properties = FlowRuntimeProps.flowRuntimeProps() .setGatherPartitions(1) .buildProperties(properties); FlowConnector flowConnector = new Hadoop2TezFlowConnector(properties); final String inputPath = args[0]; final String outputPath = args[1]; final Fields fields = new Fields("userId", "data1", "data2", "data3"); final Scheme scheme = new TextDelimited(fields, false, true, ","); final Pipe inPipe = new Pipe("inPipe"); final Tap inTap = new Hfs(scheme, inputPath); final Fields groupFields = new Fields("userId"); Pipe usersPipe = new GroupBy("usersWithCount", inPipe, groupFields); usersPipe = new Every(usersPipe, groupFields, new Count(), Fields.ALL); usersPipe = new GroupBy(usersPipe, Fields.NONE, new Fields("count", "userId"), true); usersPipe = new Each(usersPipe, new Fields("count"), new RegexFilter( "^(?:[2-9]|(?:[1-9][0-9]+))" )); final Fields resultFields = new Fields("userId", "count"); final Scheme outputScheme = new TextDelimited(resultFields, false, true, ","); Tap sinkTap = new Hfs(outputScheme, outputPath); FlowDef flowDef = FlowDef.flowDef() .setName("Cascading-TEZ") .addSource(inPipe, inTap) .addTailSink(usersPipe, sinkTap); Flow flow = flowConnector.connect(flowDef); flow.complete(); System.exit(0); } }