package com.manning.hip.ch5;
import org.apache.avro.Schema;
import org.apache.avro.file.*;
import org.apache.avro.generic.*;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.IOUtils;
import java.io.*;
import java.nio.ByteBuffer;
/**
 * Packs the files found directly inside a local directory into a single
 * Avro container file, storing one record per source file. Each record
 * carries the file's absolute path and its raw bytes.
 */
public class SmallFilesWrite {

  /** Name of the record field that holds the source file's absolute path. */
  public static final String FIELD_FILENAME = "filename";

  /** Name of the record field that holds the source file's bytes. */
  public static final String FIELD_CONTENTS = "contents";

  private static final String SCHEMA_JSON = //<co id="ch02_smallfilewrite_comment1"/>
      "{\"type\": \"record\", \"name\": \"SmallFilesTest\", "
          + "\"fields\": ["
          + "{\"name\":\"" + FIELD_FILENAME
          + "\", \"type\":\"string\"},"
          + "{\"name\":\"" + FIELD_CONTENTS
          + "\", \"type\":\"bytes\"}]}";

  /** Avro schema of the records written by {@link #writeToAvro}. */
  public static final Schema SCHEMA = Schema.parse(SCHEMA_JSON);

  /**
   * Writes every file located directly in {@code srcPath} (sub-directories
   * are not descended into) to {@code outputStream} as a Snappy-compressed
   * Avro container file, and prints each file's absolute path together with
   * the MD5 hex digest of its contents to standard output.
   *
   * <p>The supplied stream is always closed by this method, whether or not
   * the write succeeds.
   *
   * @param srcPath      local directory whose files are to be packed
   * @param outputStream destination for the Avro container file; closed on return
   * @throws IOException if a source file cannot be read, or if writing,
   *                     flushing or closing the Avro file fails
   */
  public static void writeToAvro(File srcPath,
                                 OutputStream outputStream)
      throws IOException {
    DataFileWriter<Object> writer =
        new DataFileWriter<Object>(
            new GenericDatumWriter<Object>())
            .setSyncInterval(100); //<co id="ch02_smallfilewrite_comment2"/>
    boolean success = false;
    try {
      writer.setCodec(CodecFactory.snappyCodec()); //<co id="ch02_smallfilewrite_comment3"/>
      writer.create(SCHEMA, outputStream); //<co id="ch02_smallfilewrite_comment4"/>

      for (Object obj : FileUtils.listFiles(srcPath, null, false)) {
        File file = (File) obj;
        String filename = file.getAbsolutePath();
        byte[] content = FileUtils.readFileToByteArray(file);

        GenericRecord record = new GenericData.Record(SCHEMA); //<co id="ch02_smallfilewrite_comment5"/>
        record.put(FIELD_FILENAME, filename); //<co id="ch02_smallfilewrite_comment6"/>
        record.put(FIELD_CONTENTS, ByteBuffer.wrap(content)); //<co id="ch02_smallfilewrite_comment7"/>
        writer.append(record); //<co id="ch02_smallfilewrite_comment8"/>

        System.out.println(
            filename
                + ": "
                + DigestUtils.md5Hex(content));
      }

      // Close explicitly on the success path so that a failure while flushing
      // the final block is reported to the caller instead of being swallowed,
      // which would otherwise leave a silently truncated file behind.
      writer.close();
      success = true;
    } finally {
      if (!success) {
        // An exception is already propagating; release the writer quietly so
        // a secondary close failure does not mask the original error.
        IOUtils.cleanup(null, writer);
      }
      // Make sure the caller's stream is released on every path, including
      // failures that happen before the writer has taken hold of the stream.
      IOUtils.cleanup(null, outputStream);
    }
  }

  /**
   * Command-line entry point.
   *
   * @param args {@code args[0]} is the local source directory and
   *             {@code args[1]} is the destination file path on the
   *             configured Hadoop file system; further arguments are ignored
   * @throws IllegalArgumentException if fewer than two arguments are given
   * @throws Exception if the destination cannot be created or the write fails
   */
  public static void main(String... args) throws Exception {
    if (args.length < 2) {
      throw new IllegalArgumentException(
          "Usage: SmallFilesWrite <local source dir> <destination file>");
    }

    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(config);

    File sourceDir = new File(args[0]);
    Path destFile = new Path(args[1]);

    OutputStream os = hdfs.create(destFile);
    // writeToAvro takes ownership of the stream and closes it on all paths.
    writeToAvro(sourceDir, os);
  }
}