package com.cloudera.sa.hcu.io.recompression.nonsplittable;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.EnumSet;
import java.util.zip.GZIPInputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CreateFlag;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.SnappyCodec;

/**
 * Command-line tool that reads a non-splittable gzip-compressed text file from
 * HDFS line by line and rewrites it as a block-compressed {@link SequenceFile}
 * ({@link NullWritable} key, {@link Text} value) so downstream MapReduce jobs
 * can split it.
 */
public class NonSplittableGzipToSeq {

    /**
     * Entry point.
     *
     * <p>Usage: {@code <inputFilePath(s)> <outputPath> <compressionCodec>}
     * where the codec is one of {@code snappy}, {@code gzip}, {@code bzip2}
     * (anything else falls back to snappy — see {@link #getCompressionCodec}).
     *
     * @param args command-line arguments as described above
     * @throws IOException if reading the input or writing the SequenceFile fails
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 3) {
            System.out.println("NonSplittableGzipToSeq Help:");
            System.out.println("Parameters: <inputFilePath(s)> <outputPath> <compressionCodec>");
            System.out.println();
            return;
        }
        String inputLocation = args[0];
        String outputLocation = args[1];
        String compressionCodec = args[2];

        Configuration config = new Configuration();
        FileSystem hdfs = FileSystem.get(config);

        Path inputFilePath = new Path(inputLocation);
        Path outputFilePath = new Path(outputLocation);

        // try-with-resources closes both resources in reverse order even on
        // failure, replacing the original nested try/finally blocks.
        try (BufferedReader reader = getGzipReader(hdfs, inputFilePath);
             SequenceFile.Writer writer =
                     getSequenceFileWriter(config, hdfs, outputFilePath, compressionCodec)) {
            Text value = new Text();
            String currentLine;
            long counter = 0;
            while ((currentLine = reader.readLine()) != null) {
                value.set(currentLine);
                // NullWritable key: only the line content matters downstream.
                writer.append(NullWritable.get(), value);
                counter++;
                if (counter % 10000 == 0) {
                    System.out.println("Processed " + counter + " lines.");
                }
            }
            System.out.println("Finished: Processed " + counter + " lines.");
        }
    }

    /**
     * Opens the given HDFS path as a gzip stream and wraps it in a UTF-8 reader.
     *
     * @param hdfs filesystem to open the path on
     * @param path input file; its name must end with {@code gz} or {@code gzip}
     * @return a buffered reader over the decompressed content
     * @throws IOException if the file cannot be opened, or the extension is not recognized
     */
    public static BufferedReader getGzipReader(FileSystem hdfs, Path path) throws IOException {
        FSDataInputStream inputStream = hdfs.open(path);
        if (path.getName().endsWith("gz") || path.getName().endsWith("gzip")) {
            GZIPInputStream gzip = new GZIPInputStream(inputStream);
            System.out.println("processing gzip file");
            // Decode explicitly as UTF-8 rather than the platform default charset.
            return new BufferedReader(new InputStreamReader(gzip, StandardCharsets.UTF_8));
        } else {
            // Close the already-opened stream before failing, otherwise it leaks.
            inputStream.close();
            throw new IOException("UnKnown compress type. Can only process files with ext of (gzip, gz)");
        }
    }

    /**
     * Creates a block-compressed SequenceFile writer for the given output path.
     *
     * @param config              Hadoop configuration used by the writer
     * @param hdfs                unused; retained for interface compatibility with existing callers
     * @param path                output SequenceFile path
     * @param compressionCodecStr codec name understood by {@link #getCompressionCodec(String)}
     * @return a writer emitting {@link NullWritable}/{@link Text} records
     * @throws IOException if the writer cannot be created
     */
    public static SequenceFile.Writer getSequenceFileWriter(Configuration config, FileSystem hdfs,
            Path path, String compressionCodecStr) throws IOException {
        SequenceFile.Metadata metaData = new SequenceFile.Metadata();
        EnumSet<CreateFlag> enumSet = EnumSet.of(CreateFlag.CREATE);

        return SequenceFile.createWriter(FileContext.getFileContext(),
                config,
                path,
                NullWritable.class,
                Text.class,
                SequenceFile.CompressionType.BLOCK,
                getCompressionCodec(compressionCodecStr),
                metaData,
                enumSet);
    }

    /**
     * Maps a codec name to a Hadoop compression codec instance.
     *
     * @param value one of {@code snappy}, {@code gzip}, {@code bzip2}
     * @return the matching codec; unknown names silently fall back to snappy
     *         (original behavior, preserved for compatibility)
     */
    public static CompressionCodec getCompressionCodec(String value) {
        switch (value) {
            case "gzip":
                return new GzipCodec();
            case "bzip2":
                return new BZip2Codec();
            case "snappy":
            default:
                // Unknown codec names fall back to snappy rather than failing.
                return new SnappyCodec();
        }
    }
}