package org.bigtop.bigpetstore.etl; import java.util.Map; import org.apache.crunch.FilterFn; import org.apache.crunch.MapFn; import org.apache.crunch.PCollection; import org.apache.crunch.PTable; import org.apache.crunch.Pair; import org.apache.crunch.Pipeline; import org.apache.crunch.impl.mem.MemPipeline; import org.apache.crunch.impl.mr.MRPipeline; import org.apache.crunch.io.From; import org.apache.crunch.types.avro.Avros; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.bigtop.bigpetstore.contract.PetStoreStatistics; public class CrunchETL extends PetStoreStatistics { public static MapFn<LineItem, String> COUNT_BY_PRODUCT = new MapFn<LineItem, String>() { public String map(LineItem lineItem) { try { return lineItem.getDescription(); } catch (Throwable t) { throw new RuntimeException(t); } } }; public static MapFn<LineItem, String> COUNT_BY_STATE = new MapFn<LineItem, String>() { public String map(LineItem lineItem) { try { return lineItem.getDescription(); } catch (Throwable t) { throw new RuntimeException(t); } } }; PCollection<LineItem> lineItems; public CrunchETL(Path input, Path output) throws Exception { Pipeline pipeline = MemPipeline.getInstance(); PCollection<String> lines = pipeline.read(From.textFile(new Path(input, "part-r-00000"))); System.out.println("crunch : " + lines.getName() + " " + lines.getSize()); lineItems = lines.parallelDo(ETL, Avros.reflects(LineItem.class)); } public static MapFn ETL = new MapFn<String, LineItem>() { @Override public LineItem map(String input) { String[] fields = input.split(","); LineItem li = new LineItem(); li.setAppName(fields[1]); li.setFirstName(fields[3]); // ... li.setDescription(fields[fields.length - 1]); return li; } }; @Override public Map<String, ? extends Number> numberOfTransactionsByState() throws Exception { PTable<String, Long> counts = lineItems.parallelDo(COUNT_BY_STATE, Avros.strings()).count(); Map m = counts.materializeToMap(); System.out.println("Crunch::: " + m); return m; } @Override public Map<String, ? extends Number> numberOfProductsByProduct() throws Exception { PTable<String, Long> counts = lineItems.parallelDo(COUNT_BY_PRODUCT, Avros.strings()).count(); Map m = counts.materializeToMap(); //CrunchETL. System.out.println("Crunch::: " + m); return m; } public static void main(String... args) throws Exception { /** * PCollection<String> lines = MemPipeline .collectionOf( * "BigPetStore,storeCode_AK,1 lindsay,franco,Sat Jan 10 00:11:10 EST 1970,10.5,dog-food" * "BigPetStore,storeCode_AZ,1 tom,giles,Sun Dec 28 23:08:45 EST 1969,10.5,dog-food" * "BigPetStore,storeCode_CA,1 brandon,ewing,Mon Dec 08 20:23:57 EST 1969,16.5,organic-dog-food" * "BigPetStore,storeCode_CA,2 angie,coleman,Thu Dec 11 07:00:31 EST 1969,10.5,dog-food" * "BigPetStore,storeCode_CA,3 angie,coleman,Tue Jan 20 06:24:23 EST 1970,7.5,cat-food" * "BigPetStore,storeCode_CO,1 sharon,trevino,Mon Jan 12 07:52:10 EST 1970,30.1,antelope snacks" * "BigPetStore,storeCode_CT,1 kevin,fitzpatrick,Wed Dec 10 05:24:13 EST 1969,10.5,dog-food" * "BigPetStore,storeCode_NY,1 dale,holden,Mon Jan 12 23:02:13 EST 1970,19.75,fish-food" * "BigPetStore,storeCode_NY,2 dale,holden,Tue Dec 30 12:29:52 EST 1969,10.5,dog-food" * "BigPetStore,storeCode_OK,1 donnie,tucker,Sun Jan 18 04:50:26 EST 1970,7.5,cat-food" * ); **/ // FAILS Pipeline pipeline = new MRPipeline(CrunchETL.class); PCollection<String> lines = pipeline.read(From.textFile(new Path( "/tmp/BigPetStore1388719888255/generated/part-r-00000"))); PCollection<LineItem> lineItems = lines.parallelDo( new MapFn<String, LineItem>() { @Override public LineItem map(String input) { System.out.println("proc1 " + input); String[] fields = input.split(","); LineItem li = new LineItem(); li.setAppName("" + fields[1]); li.setFirstName("" + fields[3]); li.setDescription("" + fields[fields.length - 1]); return li; } }, Avros.reflects(LineItem.class)); for (LineItem i : lineItems.materialize()) System.out.println(i); // System.out.println(lineItems.getSize() + " " + lineItems.count() + // " " + lineItems.asCollection().getValue().size()); } }