package com.hadooparchitecturebook.crunch; import org.apache.hadoop.conf.Configuration; import org.apache.crunch.lib.join.DefaultJoinStrategy; import org.apache.crunch.FilterFn; import org.apache.crunch.MapFn; import org.apache.crunch.impl.mr.MRPipeline; import org.apache.crunch.impl.mr.MRPipeline; import org.apache.crunch.Pair; import org.apache.crunch.PCollection; import org.apache.crunch.Pipeline; import org.apache.crunch.PipelineResult; import org.apache.crunch.PTable; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.crunch.types.writable.Writables; import org.apache.crunch.types.avro.Avros; import org.apache.crunch.lib.join.JoinType; import org.apache.commons.lang.StringUtils; import org.apache.crunch.Target.WriteMode; import org.apache.crunch.io.At; public class JoinFilterExampleCrunch implements Tool { public static final int FOO_ID_INX = 0; public static final int FOO_VALUE_INX = 1; public static final int FOO_BAR_ID_INX = 2; public static final int BAR_ID_INX = 0; public static final int BAR_VALUE_INX = 1; public static void main(String[] args) throws Exception { ToolRunner.run(new Configuration(), new JoinFilterExampleCrunch(), args); } Configuration config; public Configuration getConf() { return config; } public void setConf(Configuration config) { this.config = config; } public int run(String[] args) throws Exception { String fooInputPath = args[0]; String barInputPath = args[1]; String outputPath = args[2]; int fooValMax = Integer.parseInt(args[3]); int joinValMax = Integer.parseInt(args[4]); int numberOfReducers = Integer.parseInt(args[5]); Pipeline pipeline = new MRPipeline(JoinFilterExampleCrunch.class, getConf()); //<1> PCollection<String> fooLines = pipeline.readTextFile(fooInputPath); //<2> PCollection<String> barLines = pipeline.readTextFile(barInputPath); PTable<Long, Pair<Long, Integer>> fooTable = fooLines.parallelDo( //<3> new FooIndicatorFn(), Avros.tableOf(Avros.longs(), Avros.pairs(Avros.longs(), Avros.ints()))); fooTable = fooTable.filter(new FooFilter(fooValMax)); //<4> PTable<Long, Integer> barTable = barLines.parallelDo(new BarIndicatorFn(), Avros.tableOf(Avros.longs(), Avros.ints())); DefaultJoinStrategy<Long, Pair<Long, Integer>, Integer> joinStrategy = //<5> new DefaultJoinStrategy <Long, Pair<Long, Integer>, Integer> (numberOfReducers); PTable<Long, Pair<Pair<Long, Integer>, Integer>> joinedTable = joinStrategy //<6> .join(fooTable, barTable, JoinType.INNER_JOIN); PTable<Long, Pair<Pair<Long, Integer>, Integer>> filteredTable = joinedTable.filter(new JoinFilter(joinValMax)); filteredTable.write(At.textFile(outputPath), WriteMode.OVERWRITE); //<7> PipelineResult result = pipeline.done(); return result.succeeded() ? 0 : 1; } public static class FooIndicatorFn extends MapFn<String, Pair<Long, Pair<Long, Integer>>> { private static final long serialVersionUID = 1L; @Override public Pair<Long, Pair<Long, Integer>> map(String input) { String[] cells = StringUtils.split(input.toString(), "|"); Pair<Long, Integer> valuePair = new Pair<Long, Integer>( Long.parseLong(cells[FOO_ID_INX]), Integer.parseInt(cells[FOO_VALUE_INX])); return new Pair<Long, Pair<Long, Integer>>( Long.parseLong(cells[FOO_BAR_ID_INX]), valuePair); } } public static class FooFilter extends FilterFn<Pair<Long, Pair<Long, Integer>>> { private static final long serialVersionUID = 1L; int fooValMax; FooFilter(int fooValMax) { this.fooValMax = fooValMax; } @Override public boolean accept(Pair<Long, Pair<Long, Integer>> input) { return input.second().second() <= fooValMax; } } public static class BarIndicatorFn extends MapFn<String, Pair<Long, Integer>> { private static final long serialVersionUID = 1L; @Override public Pair<Long, Integer> map(String input) { String[] cells = StringUtils.split(input.toString(), "|"); return new Pair<Long, Integer>(Long.parseLong(cells[BAR_ID_INX]), Integer.parseInt(cells[BAR_VALUE_INX])); } } public static class JoinFilter extends FilterFn<Pair<Long, Pair<Pair<Long, Integer>, Integer>>> { private static final long serialVersionUID = 1L; int joinValMax; JoinFilter(int joinValMax) { this.joinValMax = joinValMax; } @Override public boolean accept(Pair<Long, Pair<Pair<Long, Integer>, Integer>> input) { return input.second().first().second() + input.second().second() <= joinValMax; } } }