package com.cloudera.sa.hcu.io.recompression.nonsplittable;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.EnumSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CreateFlag;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * Converts a non-splittable zip archive on HDFS into block-compressed
 * SequenceFiles, one output file per zip entry. Each line of an entry
 * becomes a (NullWritable, Text) record.
 */
public class NonSplittableZipToSeq {

  public static void main(String[] args) throws IOException {
    if (args.length < 3) {
      System.out.println("NonSplittableZipToSeq Help:");
      System.out.println("Parameters: <inputFilePath(s)> <outputPath> <compressionCodec>");
      System.out.println();
      return;
    }

    String inputLocation = args[0];
    String outputLocation = args[1];
    String compressionCodec = args[2];

    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(config);

    Path inputFilePath = new Path(inputLocation);
    ZipInputStream zipReader = getZipReader(hdfs, inputFilePath);

    try {
      Text value = new Text();
      long counter = 0;
      ZipEntry ze;
      while ((ze = zipReader.getNextEntry()) != null) {
        String entryName = ze.getName();
        System.out.println("Entry Name: " + entryName + " " + ze.getSize());

        // A fresh reader per entry: ZipInputStream reports end-of-stream at
        // each entry boundary, and InputStreamReader latches EOF, so a single
        // reader shared across entries would stop after the first entry.
        BufferedReader reader = new BufferedReader(new InputStreamReader(zipReader));

        Path outputFilePath = new Path(outputLocation + "/" + entryName);
        SequenceFile.Writer writer = getSequenceFileWriter(config, hdfs,
            outputFilePath, compressionCodec);
        try {
          String currentLine;
          while ((currentLine = reader.readLine()) != null) {
            value.set(currentLine);
            writer.append(NullWritable.get(), value);
            counter++;
            if (counter % 10000 == 0) {
              System.out.println("Processed " + counter + " lines.");
            }
          }
        } finally {
          writer.close();
        }
        // The per-entry reader is deliberately not closed here: closing it
        // would also close the underlying ZipInputStream.
      }
      System.out.println("Finished: Processed " + counter + " lines.");
    } finally {
      zipReader.close();
    }
  }

  public static ZipInputStream getZipReader(FileSystem hdfs, Path path)
      throws IOException {
    FSDataInputStream inputStream = hdfs.open(path);
    if (path.getName().endsWith("zip")) {
      System.out.println("processing zip file");
      return new ZipInputStream(inputStream);
    } else {
      inputStream.close();
      throw new IOException(
          "Unknown compression type. Can only process files with the extension zip");
    }
  }

  public static SequenceFile.Writer getSequenceFileWriter(Configuration config,
      FileSystem hdfs, Path path, String compressionCodecStr) throws IOException {
    // Create our writer: block-compressed, NullWritable keys, Text values.
    SequenceFile.Metadata metaData = new SequenceFile.Metadata();
    EnumSet<CreateFlag> enumSet = EnumSet.of(CreateFlag.CREATE);
    return SequenceFile.createWriter(FileContext.getFileContext(), config, path,
        NullWritable.class, Text.class, SequenceFile.CompressionType.BLOCK,
        getCompressionCodec(config, compressionCodecStr), metaData, enumSet);
  }

  public static CompressionCodec getCompressionCodec(Configuration config, String value) {
    // Instantiate through ReflectionUtils so the codec gets a Configuration;
    // an unconfigured codec fails when the writer first compresses a block.
    if (value.equals("gzip")) {
      return ReflectionUtils.newInstance(GzipCodec.class, config);
    } else if (value.equals("bzip2")) {
      return ReflectionUtils.newInstance(BZip2Codec.class, config);
    } else {
      // Anything else (including "snappy") falls back to Snappy.
      return ReflectionUtils.newInstance(SnappyCodec.class, config);
    }
  }
}
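// Example invocation (a sketch: the jar name and HDFS paths below are
// assumptions for illustration, not taken from this repository):
//
//   hadoop jar hcu.jar \
//       com.cloudera.sa.hcu.io.recompression.nonsplittable.NonSplittableZipToSeq \
//       /user/me/in/archive.zip /user/me/out snappy
//
// This would write one block-compressed SequenceFile under /user/me/out
// for each entry in archive.zip, using the Snappy codec.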