package com.cloudera.sa.hcu.io.get;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
 * Command-line getter that reads an Avro container file through the Hadoop
 * {@link FileSystem} API and writes each record's {@code toString()} form to a
 * local file. Optionally also writes the file's embedded schema to a second
 * local file.
 */
public class GetAvroFile extends AbstractGetter
{
    /**
     * Command-line entry point; delegates to {@link #getFile(String[])}.
     *
     * @param args see {@link #getFile(String[])}
     * @throws Exception if reading the input or writing any output fails
     */
    public static void main(String[] args) throws Exception
    {
        (new GetAvroFile()).getFile(args);
    }

    /**
     * Copies the records of an Avro file to a local text file.
     *
     * <p>If fewer than two arguments are supplied, a usage message is printed
     * and nothing else happens.
     *
     * @param args {@code args[0]} input file path (resolved against the default
     *             configured file system), {@code args[1]} local output data
     *             file path, optional {@code args[2]} local output schema file
     *             path
     * @throws IOException if the input cannot be read or an output cannot be
     *                     written
     */
    public void getFile(String[] args) throws IOException
    {
        if (args.length < 2)
        {
            System.out.println("Get Avro File:");
            System.out.println("");
            System.out.println("Parameter: <hdfs input file path> <local output data file path> <optionally local output schema file path>");
            return;
        }

        String inputLocation = args[0];
        String outputLocation = args[1];
        // Accept the schema path whenever a third argument is present, so that
        // trailing extra arguments do not silently disable schema output.
        String schemaOutputLocation = (args.length >= 3) ? args[2] : null;

        Configuration config = new Configuration();
        FileSystem hdfs = FileSystem.get(config);
        Path inputFilePath = new Path(inputLocation);

        // Each resource gets its own try/finally so that a failure while
        // opening or closing one of them cannot leak the ones opened before it.
        FSDataInputStream dataInputStream = hdfs.open(inputFilePath);
        try
        {
            // No reader schema is set: DataFileStream takes it from the file header.
            DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
            DataFileStream<GenericRecord> dataFileReader = new DataFileStream<GenericRecord>(dataInputStream, reader);
            try
            {
                BufferedWriter localDataWriter = createBufferedWriter(outputLocation);
                try
                {
                    if (schemaOutputLocation != null)
                    {
                        writeSchemaFile(dataFileReader.getSchema(), schemaOutputLocation);
                    }

                    // NOTE(review): records are written back to back with no
                    // separator between them -- confirm this is the intended
                    // output format before changing it.
                    while (dataFileReader.hasNext())
                    {
                        GenericRecord record = dataFileReader.next();
                        localDataWriter.write(record.toString());
                        onWritenRecord();
                    }
                }
                finally
                {
                    localDataWriter.close();
                }
            }
            finally
            {
                dataFileReader.close();
            }
        }
        finally
        {
            dataInputStream.close();
        }

        onFinishedWriting();
    }

    /**
     * Writes the schema's {@code toString()} text to a local file, closing the
     * file even if the write fails.
     *
     * @param schema the schema to write
     * @param schemaOutputLocation local path of the schema file to create
     * @throws IOException if the file cannot be created or written
     */
    private void writeSchemaFile(Schema schema, String schemaOutputLocation) throws IOException
    {
        BufferedWriter localSchemaWriter = new BufferedWriter(new FileWriter(new File(schemaOutputLocation)));
        try
        {
            localSchemaWriter.write(schema.toString());
        }
        finally
        {
            localSchemaWriter.close();
        }
    }
}