package hip.ch3.avro;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.mapred.AvroAsTextInputFormat;
import org.apache.avro.mapred.AvroTextOutputFormat;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.util.Iterator;

/**
 * Demonstrates an identity MapReduce job over an Avro container file using the
 * text-oriented Avro formats: the input file holds records with a plain
 * {@code bytes} schema, {@link AvroAsTextInputFormat} presents them to the
 * mapper as {@link Text}, and {@link AvroTextOutputFormat} writes the job
 * output back out as an Avro container file.
 */
public class AvroTextMapReduce {

  /** Sample lines written into the generated Avro input file. */
  public static final String[] LINES = new String[]{
      "the quick brown fox jumps over the lazy dog",
      "the cow jumps over the moon",
      "the rain in spain falls mainly on the plains"
  };

  /**
   * Writes {@link #LINES} as UTF-8 encoded records into an Avro container
   * file with a {@code bytes} schema.
   *
   * @param os destination stream; closed by this method (closing the
   *           {@link DataFileWriter} closes the underlying stream)
   * @throws IOException if the records cannot be written
   */
  public static void writeLinesBytesFile(OutputStream os) throws IOException {
    DatumWriter<ByteBuffer> writer = new GenericDatumWriter<ByteBuffer>();
    DataFileWriter<ByteBuffer> out = new DataFileWriter<ByteBuffer>(writer);
    try {
      out.create(Schema.create(Schema.Type.BYTES), os);
      for (String line : LINES) {
        out.append(ByteBuffer.wrap(line.getBytes("UTF-8")));
      }
    } finally {
      // Fix: the original leaked the writer (and the stream) if create/append
      // threw; always close so the file footer is flushed or the stream freed.
      out.close();
    }
  }

  /**
   * Generates the Avro input file, runs an identity map and identity reduce
   * over it, then reads the job output back and prints each record.
   *
   * @param args {@code args[0]} = input path, {@code args[1]} = output path
   * @throws Exception if file generation, the job, or validation fails
   */
  public static void main(String... args) throws Exception {
    JobConf job = new JobConf();
    job.setJarByClass(AvroTextMapReduce.class);

    Path input = new Path(args[0]);
    Path output = new Path(args[1]);

    // Remove output from any previous run so the job can create it fresh.
    output.getFileSystem(job).delete(output, true);

    // Generate the Avro input file; writeLinesBytesFile closes the stream.
    FileSystem hdfs = FileSystem.get(job);
    OutputStream os = hdfs.create(input);
    writeLinesBytesFile(os);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setInputFormat(AvroAsTextInputFormat.class);
    job.setOutputFormat(AvroTextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    // Explicit for clarity; Text is also the mapred default output value class.
    job.setOutputValueClass(Text.class);
    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);

    JobClient.runJob(job);

    validateSortedFile(output.getFileSystem(job)
        .open(new Path(output, "part-00000.avro")));
  }

  /** Identity mapper: emits each (key, value) pair unchanged. */
  public static class Mapper extends MapReduceBase
      implements org.apache.hadoop.mapred.Mapper<Text, Text, Text, Text> {
    @Override
    public void map(Text key, Text value,
                    OutputCollector<Text, Text> output,
                    Reporter reporter) throws IOException {
      output.collect(key, value);
    }
  }

  /** Identity reducer: emits every value for each key unchanged. */
  public static class Reducer extends MapReduceBase
      implements org.apache.hadoop.mapred.Reducer<Text, Text, Text, Text> {
    @Override
    public void reduce(Text key, Iterator<Text> values,
                       OutputCollector<Text, Text> output,
                       Reporter reporter) throws IOException {
      while (values.hasNext()) {
        output.collect(key, values.next());
      }
    }
  }

  /**
   * Reads an Avro container file of {@code bytes} records from {@code is} and
   * prints each record to stdout as a trimmed UTF-8 string.
   *
   * @param is stream positioned at the start of the Avro file; always closed
   *           before this method returns
   * @throws Exception if the file cannot be read or decoded
   */
  public static void validateSortedFile(InputStream is) throws Exception {
    DatumReader<ByteBuffer> reader = new GenericDatumReader<ByteBuffer>();
    DataFileStream<ByteBuffer> lines = null;
    try {
      lines = new DataFileStream<ByteBuffer>(is, reader);
      for (ByteBuffer line : lines) {
        byte[] b = new byte[line.remaining()];
        line.get(b);
        System.out.println(new String(b, "UTF-8").trim());
      }
    } finally {
      // Fix: the original never closed the DataFileStream and skipped
      // is.close() whenever reading threw; release both unconditionally.
      if (lines != null) {
        lines.close();
      }
      is.close();
    }
  }
}