package com.github.projectflink.avro;
import com.github.projectflink.avro.generated.AvroLineitem;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.AvroInputFormat;
import org.apache.flink.api.java.io.DiscardingOutputFormat;
import org.apache.flink.core.fs.Path;
import org.apache.flink.util.Collector;
import java.util.Iterator;
/**
 *
 * Required steps:
 * - Run Generate Lineitems: ./flink run -v -p 152 -c com.github.projectflink.avro.GenerateLineitems ../../testjob/flink-jobs/target/flink-jobs-0.1-SNAPSHOT.jar -p 152 -o hdfs:///user/robert/datasets/tpch1/
 * - Run Prepare: ./flink run -v -p 152 -c com.github.projectflink.avro.Prepare ../../testjob/flink-jobs/target/flink-jobs-0.1-SNAPSHOT.jar hdfs:///user/robert/datasets/tpch1/lineitems.csv hdfs:///user/robert/datasets/tpch1-avro/
 * - Run Compare: ./flink run -v -p 152 -c com.github.projectflink.avro.CompareJob ../../testjob/flink-jobs/target/flink-jobs-0.1-SNAPSHOT.jar hdfs:///user/robert/datasets/tpch1-avro/ hdfs:///user/robert/datasets/tpch1/lineitems.csv
 *
 *
 * This job reads the Lineitem data from both the text file and the Avro files and checks that the
 * records match. It fails with an exception as soon as a record is missing, duplicated or different.
 */
public class CompareJob {

    /** Message reported when the job is started without both input paths. */
    private static final String USAGE = "Usage: CompareJob <avro input path> <csv input path>";

    /**
     * Builds and runs the comparison job.
     *
     * @param args {@code args[0]} is the path of the Avro input, {@code args[1]} the path of the
     *             text (CSV) input
     * @throws IllegalArgumentException if fewer than two arguments are supplied
     * @throws Exception if setting up or executing the job fails
     */
    public static void main(final String[] args) throws Exception {
        // Fail early with a usage hint instead of an ArrayIndexOutOfBoundsException further down.
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException(USAGE);
        }
        final String avroPath = args[0];
        final String csvPath = args[1];

        // set up the execution environment
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<AvroLineitem> lineItemFromAvro = env.createInput(
                new AvroInputFormat<AvroLineitem>(new Path(avroPath), AvroLineitem.class));
        DataSet<AvroLineitem> lineItemFromCsv = env.readTextFile(csvPath).map(new Prepare.AvroLineItemMapper());

        // Group both inputs on the same four key fields and check every group.
        DataSet<String> empty = lineItemFromAvro
                .coGroup(lineItemFromCsv)
                .where("orderKey", "partKey", "supplierKey", "lineNumber")
                .equalTo("orderKey", "partKey", "supplierKey", "lineNumber")
                .with(new RecordMatcher());

        // The matcher never emits a record; the result is only wired to a discarding sink.
        empty.output(new DiscardingOutputFormat<String>());

        env.execute("Compare Job");
    }

    /**
     * Checks that a key group holds exactly one record from the Avro input and exactly one record
     * from the CSV input, and that the two records are equal. Nothing is ever collected; every
     * violation is reported by throwing a {@link RuntimeException} that names the records involved.
     */
    private static final class RecordMatcher implements CoGroupFunction<AvroLineitem, AvroLineitem, String> {

        private static final long serialVersionUID = 1L;

        /**
         * Validates one key group.
         *
         * @param avro      records of this group read from the Avro input
         * @param csv       records of this group read from the CSV input
         * @param collector unused, no output is produced
         * @throws RuntimeException if either side is empty, either side holds more than one
         *                          record, or the two records differ
         */
        @Override
        public void coGroup(Iterable<AvroLineitem> avro, Iterable<AvroLineitem> csv, Collector<String> collector) throws Exception {
            // Obtain each iterator exactly once so that neither input is traversed twice.
            Iterator<AvroLineitem> aIt = avro.iterator();
            Iterator<AvroLineitem> cIt = csv.iterator();

            if (!aIt.hasNext()) {
                throw new RuntimeException("Expected item from Avro input"
                        + (cIt.hasNext() ? ". unmatched csv=" + cIt.next() : ""));
            }
            AvroLineitem left = aIt.next();
            if (aIt.hasNext()) {
                throw new RuntimeException("Unexpectedly received two avro records on this side. left=" + left + " next=" + aIt.next());
            }

            if (!cIt.hasNext()) {
                throw new RuntimeException("Expected item from CSV input. unmatched avro=" + left);
            }
            AvroLineitem right = cIt.next();
            if (cIt.hasNext()) {
                throw new RuntimeException("Unexpectedly received two CSV records on this side. right=" + right + " next=" + cIt.next());
            }

            if (!right.equals(left)) {
                throw new RuntimeException("Records are not equal. avro=" + left + " csv=" + right);
            }
        }
    }
}