package com.twitter.elephantbird.util;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.hadoop.compression.lzo.LzoIndex;
import com.hadoop.compression.lzo.LzoIndexer;
import com.hadoop.compression.lzo.LzopCodec;

/**
 * Miscellaneous lzo related utilities.
 */
public class LzoUtils {

  public static final Logger LOG = LoggerFactory.getLogger(LzoUtils.class);

  /**
   * Configuration key for the minimum file size (in bytes) above which the
   * index is kept. When unset or negative, the block size reported by the
   * file system for the output file is used instead.
   */
  public static final String LZO_OUTPUT_INDEXABLE_MINSIZE =
                                    "elephantbird.lzo.output.indexable.minsize";

  /** Configuration key that enables writing an index alongside the lzop file. */
  public static final String LZO_OUTPUT_INDEX = "elephantbird.lzo.output.index";

  /**
   * A work-around to support environments with older versions of LzopCodec.
   * It might not be feasible to select the right version of hadoop-lzo
   * in some cases. This should be removed by EB-3.0 at the latest.
   */
  private static final boolean isLzopIndexSupported = detectLzopIndexSupport();

  /**
   * Checks via reflection whether the LzopCodec on the classpath exposes
   * <code>createIndexedOutputStream(OutputStream, DataOutputStream)</code>.
   */
  private static boolean detectLzopIndexSupport() {
    try {
      return null != LzopCodec.class.getMethod("createIndexedOutputStream",
                                               OutputStream.class,
                                               DataOutputStream.class);
    } catch (Exception e) {
      // older version of hadoop-lzo: the method does not exist.
      return false;
    }
  }

  /**
   * Closes a stream while cleaning up after an earlier failure. Any
   * IOException is logged at debug level rather than thrown, so that it
   * does not mask the exception that triggered the cleanup.
   */
  private static void closeQuietly(OutputStream stream) {
    if (stream == null) {
      return;
    }
    try {
      stream.close();
    } catch (IOException e) {
      LOG.debug("Ignoring failure while closing stream during cleanup", e);
    }
  }

  /**
   * Creates an lzop output stream. The index for the lzop is
   * also written to another file at the same time if
   * <code>elephantbird.lzo.output.index</code> is set in configuration. <p>
   *
   * The index is first written to a temporary file
   * (<code>path + LzoIndex.LZO_TMP_INDEX_SUFFIX</code>). When the returned
   * stream is closed, the temporary file is renamed to
   * <code>path + LzoIndex.LZO_INDEX_SUFFIX</code>, unless the output file is
   * not larger than the minimum indexable size
   * (<code>elephantbird.lzo.output.indexable.minsize</code>, defaulting to
   * the block size of the file), in which case the index file is deleted
   * (in line with {@link LzoIndexer} behavior). <p>
   *
   * Both the data file and the temporary index file are created with
   * <code>overwrite = false</code>.
   *
   * @param conf configuration used to set up the codec, resolve the file
   *             system and read the index related settings
   * @param path location of the lzop file to create
   * @return a stream writing lzop compressed data to <code>path</code>;
   *         closing it more than once has no further effect
   * @throws IOException if any of the underlying streams can not be created;
   *         streams opened before the failure are closed before it propagates
   */
  public static DataOutputStream getIndexedLzoOutputStream(Configuration conf, Path path) throws IOException {
    LzopCodec codec = new LzopCodec();
    codec.setConf(conf);

    final Path file = path;
    final FileSystem fs = file.getFileSystem(conf);
    final Path tmpIndexPath = file.suffix(LzoIndex.LZO_TMP_INDEX_SUFFIX);

    FSDataOutputStream fileOut = null;
    FSDataOutputStream indexOut = null;
    OutputStream out = null;
    try {
      fileOut = fs.create(file, false);

      if (conf.getBoolean(LZO_OUTPUT_INDEX, false)) {
        if (isLzopIndexSupported) {
          indexOut = fs.create(tmpIndexPath, false);
        } else {
          LOG.warn("elephantbird.lzo.output.index is enabled, but LzopCodec "
              + "does not have createIndexedOutputStream method. "
              + "Please upgrade hadoop-lzo.");
        }
      }

      out = (indexOut != null)
          ? codec.createIndexedOutputStream(fileOut, indexOut)
          : codec.createOutputStream(fileOut);
    } finally {
      if (out == null) {
        // Setup failed part way through: release whatever was opened so the
        // underlying file handles are not leaked.
        closeQuietly(indexOut);
        closeQuietly(fileOut);
      }
    }

    final boolean isIndexed = indexOut != null;
    final long minIndexableSize = conf.getLong(LZO_OUTPUT_INDEXABLE_MINSIZE, -1L);

    return new DataOutputStream(out) {
      // guards against finalizing the index more than once.
      private boolean closed = false;

      // override close() to handle renaming index file.
      @Override
      public void close() throws IOException {
        if (closed) {
          return;
        }
        closed = true;

        super.close();

        if (!isIndexed) {
          return;
        }

        // rename or remove the index file based on file size.
        FileStatus stat = fs.getFileStatus(file);
        final long minSizeToIndex = minIndexableSize < 0
                                    ? stat.getBlockSize()
                                    : minIndexableSize;

        if (stat.getLen() <= minSizeToIndex) {
          if (!fs.delete(tmpIndexPath, false)) {
            LOG.warn("Could not delete temporary lzo index file {}", tmpIndexPath);
          }
        } else {
          Path indexPath = file.suffix(LzoIndex.LZO_INDEX_SUFFIX);
          if (!fs.rename(tmpIndexPath, indexPath)) {
            LOG.warn("Could not rename temporary lzo index file {} to {}",
                     tmpIndexPath, indexPath);
          }
        }
      }
    };
  }
}