package hip.ch4;

import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;

public class SmallFilesRead extends Configured implements Tool {

  private static final String FIELD_FILENAME = "filename";
  private static final String FIELD_CONTENTS = "contents";

  /**
   * Main entry point for the example.
   *
   * @param args the command-line arguments
   * @throws Exception when something goes wrong
   */
  public static void main(final String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new SmallFilesRead(), args);
    System.exit(res);
  }

  /**
   * Reads the Avro container file named by the first argument and dumps
   * its records to standard output.
   *
   * @param args the command-line arguments
   * @return the process exit code
   * @throws Exception if something goes wrong
   */
  public int run(final String[] args) throws Exception {
    FileSystem hdfs = FileSystem.get(getConf());
    Path destFile = new Path(args[0]);

    InputStream is = hdfs.open(destFile);
    readFromAvro(is);
    return 0;
  }

  public static void readFromAvro(InputStream is) throws IOException {
    // Wrap the raw input stream in an Avro reader; the schema is embedded
    // in the container file, so none needs to be supplied here.
    DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>(
        is, new GenericDatumReader<GenericRecord>());
    try {
      // Iterate over every record in the container file.
      for (GenericRecord record : reader) {
        // Copy only the valid bytes out of the ByteBuffer. Hashing
        // ByteBuffer.array() directly would include the entire backing
        // array, which can be larger than the field's actual contents.
        ByteBuffer contents = (ByteBuffer) record.get(FIELD_CONTENTS);
        byte[] bytes = new byte[contents.remaining()];
        contents.duplicate().get(bytes);

        // Print each record's filename and an MD5 hash of its contents.
        System.out.println(
            record.get(FIELD_FILENAME) + ": " + DigestUtils.md5Hex(bytes));
      }
    } finally {
      // Close the reader first, then the underlying stream, even if
      // iteration fails partway through.
      IOUtils.cleanup(null, reader, is);
    }
  }
}
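// ---------------------------------------------------------------------------
// Companion sketch (a separate source file, not part of the original
// listing): the writer side that produces the container files SmallFilesRead
// consumes. The only facts taken from the reader above are the two field
// names, "filename" and "contents"; the class name SmallFilesWriteSketch,
// the exact schema string, and the use of local java.io.File input are
// illustrative assumptions.
// ---------------------------------------------------------------------------
package hip.ch4;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;

public class SmallFilesWriteSketch {

  // Assumed schema: one record per small file, using the same field names
  // that SmallFilesRead reads back.
  private static final Schema SCHEMA = new Schema.Parser().parse(
      "{\"type\":\"record\",\"name\":\"SmallFile\",\"fields\":["
          + "{\"name\":\"filename\",\"type\":\"string\"},"
          + "{\"name\":\"contents\",\"type\":\"bytes\"}]}");

  /**
   * Packs the given local files into a single Avro container file, one
   * record per input file. Consolidating many small files this way avoids
   * the NameNode overhead of storing each one individually in HDFS.
   */
  public static void writeToAvro(File[] srcFiles, File destFile)
      throws IOException {
    DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(
        new GenericDatumWriter<GenericRecord>(SCHEMA));
    try {
      writer.create(SCHEMA, destFile);
      for (File src : srcFiles) {
        GenericRecord record = new GenericData.Record(SCHEMA);
        record.put("filename", src.getName());
        record.put("contents",
            ByteBuffer.wrap(Files.readAllBytes(src.toPath())));
        writer.append(record);
      }
    } finally {
      writer.close();
    }
  }
}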