package com.hadooparchitecturebook.cascading.joinfilter;

import java.util.Properties;

import cascading.flow.Flow;
import cascading.flow.FlowDef;
import cascading.flow.FlowProcess;
import cascading.flow.hadoop.HadoopFlowConnector;
import cascading.operation.BaseOperation;
import cascading.operation.Filter;
import cascading.operation.FilterCall;
import cascading.pipe.Each;
import cascading.pipe.HashJoin;
import cascading.pipe.Pipe;
import cascading.property.AppProps;
import cascading.property.ConfigDef.Mode;
import cascading.scheme.hadoop.TextDelimited;
import cascading.tap.Tap;
import cascading.tap.hadoop.Hfs;
import cascading.tuple.Fields;
import cascading.tuple.collect.SpillableProps;

/**
 * Cascading example that hash-joins a "foo" data set (fooId|fooVal|foobarId)
 * with a "bar" data set (barId|barVal), filtering foo records before the join
 * and the joined records after it.
 *
 * <p>Arguments: fooInputPath barInputPath outputPath fooValMax joinValMax
 * numberOfReducers
 */
public class JoinFilterExampleCascading {

  public static void main(String[] args) {
    // Fail fast with a usage message instead of an
    // ArrayIndexOutOfBoundsException when invoked with too few arguments.
    if (args.length != 6) {
      System.err.println("Usage: JoinFilterExampleCascading "
          + "<fooInputPath> <barInputPath> <outputPath> "
          + "<fooValMax> <joinValMax> <numberOfReducers>");
      System.exit(1);
    }

    String fooInputPath = args[0];
    String barInputPath = args[1];
    String outputPath = args[2];
    int fooValMax = Integer.parseInt(args[3]);
    int joinValMax = Integer.parseInt(args[4]);
    int numberOfReducers = Integer.parseInt(args[5]);

    Properties properties = new Properties();
    AppProps.setApplicationJarClass(properties, JoinFilterExampleCascading.class);
    // Set both the MR1 and the YARN-era property names so the reducer count
    // takes effect regardless of the cluster's Hadoop version.
    properties.setProperty("mapred.reduce.tasks", Integer.toString(numberOfReducers));
    properties.setProperty("mapreduce.job.reduces", Integer.toString(numberOfReducers));

    SpillableProps props = SpillableProps.spillableProps()
        .setCompressSpill(true)
        .setMapSpillThreshold(50 * 1000);

    HadoopFlowConnector flowConnector = new HadoopFlowConnector(properties);

    // Source and sink taps over pipe-delimited text.
    Fields fooFields = new Fields("fooId", "fooVal", "foobarId");
    Tap fooTap = new Hfs(new TextDelimited(fooFields, "|"), fooInputPath);
    Fields barFields = new Fields("barId", "barVal");
    Tap barTap = new Hfs(new TextDelimited(barFields, "|"), barInputPath);
    Tap outputTap = new Hfs(new TextDelimited(false, "|"), outputPath);

    Fields joinFooFields = new Fields("foobarId");
    Fields joinBarFields = new Fields("barId");

    Pipe fooPipe = new Pipe("fooPipe");
    Pipe barPipe = new Pipe("barPipe");

    // Filter foo records before the join so fewer tuples reach the HashJoin.
    Pipe fooFiltered = new Each(fooPipe, fooFields, new FooFilter(fooValMax));
    Pipe joinedPipe = new HashJoin(fooFiltered, joinFooFields, barPipe, joinBarFields);
    // Apply the spill tuning (compression, map spill threshold) to the join
    // step only, replacing any inherited spill settings.
    props.setProperties(joinedPipe.getConfigDef(), Mode.REPLACE);

    Fields joinFields = new Fields("fooId", "fooVal", "foobarId", "barVal");
    Pipe joinedFilteredPipe = new Each(joinedPipe, joinFields, new JoinedFilter(joinValMax));

    // NOTE(review): the flow name "wc" and DOT path look copied from a
    // word-count sample; also writeDOT fails if the "dot" directory does not
    // already exist. Confirm/rename before shipping.
    FlowDef flowDef = FlowDef.flowDef().setName("wc")
        .addSource(fooPipe, fooTap)
        .addSource(barPipe, barTap)
        .addTailSink(joinedFilteredPipe, outputTap);

    Flow wcFlow = flowConnector.connect(flowDef);
    wcFlow.writeDOT("dot/wc.dot");
    wcFlow.complete();
  }

  /**
   * Drops foo records whose fooVal is less than or equal to fooValMax.
   *
   * <p>NOTE(review): returning {@code true} from {@code isRemove} REMOVES the
   * tuple, so this keeps only records with fooVal &gt; fooValMax — the opposite
   * of what the name "fooValMax" suggests. Confirm the intended direction.
   */
  public static class FooFilter extends BaseOperation implements Filter {

    private final int fooValMax;

    FooFilter(int fooValMax) {
      this.fooValMax = fooValMax;
    }

    @Override
    public boolean isRemove(FlowProcess flowProcess, FilterCall filterCall) {
      // Look the value up by field name rather than positional index so the
      // filter does not silently break if the argument field order changes.
      int fooValue = filterCall.getArguments().getInteger("fooVal");
      return fooValue <= fooValMax;
    }
  }

  /**
   * Drops joined records whose fooVal + barVal is less than or equal to
   * joinValMax.
   *
   * <p>NOTE(review): as with FooFilter, {@code isRemove} returning {@code true}
   * removes the tuple, so only sums &gt; joinValMax survive — confirm this is
   * the intended direction given the "Max" naming.
   */
  public static class JoinedFilter extends BaseOperation implements Filter {

    private final int joinValMax;

    JoinedFilter(int joinValMax) {
      this.joinValMax = joinValMax;
    }

    @Override
    public boolean isRemove(FlowProcess flowProcess, FilterCall filterCall) {
      // Field-name lookups against the Each argument selector
      // ("fooId", "fooVal", "foobarId", "barVal") declared in main.
      int fooValue = filterCall.getArguments().getInteger("fooVal");
      int barValue = filterCall.getArguments().getInteger("barVal");
      return fooValue + barValue <= joinValMax;
    }
  }
}