package hip.ch4;

import org.apache.avro.Schema;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;

public class SmallFilesWrite extends Configured implements Tool {

  /**
   * Main entry point for the example.
   *
   * @param args arguments
   * @throws Exception when something goes wrong
   */
  public static void main(final String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new SmallFilesWrite(), args);
    System.exit(res);
  }

  /**
   * Write the file.
   *
   * @param args the command-line arguments
   * @return the process exit code
   * @throws Exception if something goes wrong
   */
  public int run(final String[] args) throws Exception {
    FileSystem hdfs = FileSystem.get(getConf());

    File sourceDir = new File(args[0]);
    Path destFile = new Path(args[1]);

    OutputStream os = hdfs.create(destFile);
    writeToAvro(sourceDir, os);
    return 0;
  }

  public static final String FIELD_FILENAME = "filename";
  public static final String FIELD_CONTENTS = "contents";

  private static final String SCHEMA_JSON =                       //<co id="ch02_smallfilewrite_comment1"/>
      "{\"type\": \"record\", \"name\": \"SmallFilesTest\", "
          + "\"fields\": ["
          + "{\"name\":\"" + FIELD_FILENAME + "\", \"type\":\"string\"},"
          + "{\"name\":\"" + FIELD_CONTENTS + "\", \"type\":\"bytes\"}]}";

  public static final Schema SCHEMA = Schema.parse(SCHEMA_JSON);

  public static void writeToAvro(File srcPath, OutputStream outputStream)
      throws IOException {
    DataFileWriter<Object> writer =
        new DataFileWriter<Object>(new GenericDatumWriter<Object>())
            .setSyncInterval(100);                                //<co id="ch02_smallfilewrite_comment2"/>
    writer.setCodec(CodecFactory.snappyCodec());                  //<co id="ch02_smallfilewrite_comment3"/>
    writer.create(SCHEMA, outputStream);                          //<co id="ch02_smallfilewrite_comment4"/>

    for (Object obj : FileUtils.listFiles(srcPath, null, false)) {
      File file = (File) obj;
      String filename = file.getAbsolutePath();
      byte[] content = FileUtils.readFileToByteArray(file);
      GenericRecord record = new GenericData.Record(SCHEMA);      //<co id="ch02_smallfilewrite_comment5"/>
      record.put(FIELD_FILENAME, filename);                       //<co id="ch02_smallfilewrite_comment6"/>
      record.put(FIELD_CONTENTS, ByteBuffer.wrap(content));       //<co id="ch02_smallfilewrite_comment7"/>
      writer.append(record);                                      //<co id="ch02_smallfilewrite_comment8"/>
      System.out.println(
          file.getAbsolutePath() + ": " + DigestUtils.md5Hex(content));
    }

    IOUtils.cleanup(null, writer);
    IOUtils.cleanup(null, outputStream);
  }
}
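
// ---------------------------------------------------------------------------
// SmallFilesRead.java -- a minimal companion sketch in a separate source file,
// not part of the original listing. It reads back the Avro container produced
// by SmallFilesWrite and prints each stored filename with the MD5 of its
// bytes, so the round trip can be verified against the writer's output. The
// class name SmallFilesRead and the single input-path argument are
// illustrative assumptions.
// ---------------------------------------------------------------------------
package hip.ch4;

import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.InputStream;
import java.nio.ByteBuffer;

public class SmallFilesRead extends Configured implements Tool {

  public static void main(final String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new SmallFilesRead(), args);
    System.exit(res);
  }

  public int run(final String[] args) throws Exception {
    FileSystem hdfs = FileSystem.get(getConf());
    InputStream is = hdfs.open(new Path(args[0]));

    // The Avro data file header carries the schema and codec (Snappy here),
    // so the reader needs no up-front configuration.
    DataFileStream<GenericRecord> reader =
        new DataFileStream<GenericRecord>(
            is, new GenericDatumReader<GenericRecord>());
    try {
      for (GenericRecord record : reader) {
        String filename =
            record.get(SmallFilesWrite.FIELD_FILENAME).toString();
        ByteBuffer contents =
            (ByteBuffer) record.get(SmallFilesWrite.FIELD_CONTENTS);
        byte[] bytes = new byte[contents.remaining()];
        contents.get(bytes);
        System.out.println(filename + ": " + DigestUtils.md5Hex(bytes));
      }
    } finally {
      IOUtils.cleanup(null, reader, is);
    }
    return 0;
  }
}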