/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.mapred.gridmix; import java.io.DataOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.HashMap; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CodecPool; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.apache.hadoop.io.compress.CompressionInputStream; import org.apache.hadoop.io.compress.Decompressor; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Utils; import org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics; import org.apache.hadoop.mapred.gridmix.GenerateData.GenDataFormat; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.StringUtils; /** * This is a utility class for all the compression related modules. */ class CompressionEmulationUtil { static final Log LOG = LogFactory.getLog(CompressionEmulationUtil.class); /** * Enable compression usage in GridMix runs. */ private static final String COMPRESSION_EMULATION_ENABLE = "gridmix.compression-emulation.enable"; /** * Enable input data decompression. */ private static final String INPUT_DECOMPRESSION_EMULATION_ENABLE = "gridmix.compression-emulation.input-decompression.enable"; /** * Configuration property for setting the compression ratio for map input * data. */ private static final String GRIDMIX_MAP_INPUT_COMPRESSION_RATIO = "gridmix.compression-emulation.map-input.decompression-ratio"; /** * Configuration property for setting the compression ratio of map output. */ private static final String GRIDMIX_MAP_OUTPUT_COMPRESSION_RATIO = "gridmix.compression-emulation.map-output.compression-ratio"; /** * Configuration property for setting the compression ratio of reduce output. */ private static final String GRIDMIX_REDUCE_OUTPUT_COMPRESSION_RATIO = "gridmix.compression-emulation.reduce-output.compression-ratio"; /** * Default compression ratio. */ static final float DEFAULT_COMPRESSION_RATIO = 0.5F; private static final CompressionRatioLookupTable COMPRESSION_LOOKUP_TABLE = new CompressionRatioLookupTable(); /** * This is a {@link Mapper} implementation for generating random text data. * It uses {@link RandomTextDataGenerator} for generating text data and the * output files are compressed. */ public static class RandomTextDataMapper extends Mapper<NullWritable, LongWritable, Text, Text> { private RandomTextDataGenerator rtg; @Override protected void setup(Context context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); int listSize = RandomTextDataGenerator.getRandomTextDataGeneratorListSize(conf); int wordSize = RandomTextDataGenerator.getRandomTextDataGeneratorWordSize(conf); rtg = new RandomTextDataGenerator(listSize, wordSize); } /** * Emits random words sequence of desired size. Note that the desired output * size is passed as the value parameter to this map. */ @Override public void map(NullWritable key, LongWritable value, Context context) throws IOException, InterruptedException { //TODO Control the extra data written .. //TODO Should the key\tvalue\n be considered for measuring size? // Can counters like BYTES_WRITTEN be used? What will be the value of // such counters in LocalJobRunner? for (long bytes = value.get(); bytes > 0;) { String randomKey = rtg.getRandomWord(); String randomValue = rtg.getRandomWord(); context.write(new Text(randomKey), new Text(randomValue)); bytes -= (randomValue.getBytes().length + randomKey.getBytes().length); } } } /** * Configure the {@link Job} for enabling compression emulation. */ static void configure(final Job job) throws IOException, InterruptedException, ClassNotFoundException { // set the random text mapper job.setMapperClass(RandomTextDataMapper.class); job.setNumReduceTasks(0); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setInputFormatClass(GenDataFormat.class); job.setJarByClass(GenerateData.class); // set the output compression true FileOutputFormat.setCompressOutput(job, true); try { FileInputFormat.addInputPath(job, new Path("ignored")); } catch (IOException e) { LOG.error("Error while adding input path ", e); } } /** * This is the lookup table for mapping compression ratio to the size of the * word in the {@link RandomTextDataGenerator}'s dictionary. * * Note that this table is computed (empirically) using a dictionary of * default length i.e {@value RandomTextDataGenerator#DEFAULT_LIST_SIZE}. */ private static class CompressionRatioLookupTable { private static Map<Float, Integer> map = new HashMap<Float, Integer>(60); private static final float MIN_RATIO = 0.07F; private static final float MAX_RATIO = 0.68F; // add the empirically obtained data points in the lookup table CompressionRatioLookupTable() { map.put(.07F,30); map.put(.08F,25); map.put(.09F,60); map.put(.10F,20); map.put(.11F,70); map.put(.12F,15); map.put(.13F,80); map.put(.14F,85); map.put(.15F,90); map.put(.16F,95); map.put(.17F,100); map.put(.18F,105); map.put(.19F,110); map.put(.20F,115); map.put(.21F,120); map.put(.22F,125); map.put(.23F,130); map.put(.24F,140); map.put(.25F,145); map.put(.26F,150); map.put(.27F,155); map.put(.28F,160); map.put(.29F,170); map.put(.30F,175); map.put(.31F,180); map.put(.32F,190); map.put(.33F,195); map.put(.34F,205); map.put(.35F,215); map.put(.36F,225); map.put(.37F,230); map.put(.38F,240); map.put(.39F,250); map.put(.40F,260); map.put(.41F,270); map.put(.42F,280); map.put(.43F,295); map.put(.44F,310); map.put(.45F,325); map.put(.46F,335); map.put(.47F,355); map.put(.48F,375); map.put(.49F,395); map.put(.50F,420); map.put(.51F,440); map.put(.52F,465); map.put(.53F,500); map.put(.54F,525); map.put(.55F,550); map.put(.56F,600); map.put(.57F,640); map.put(.58F,680); map.put(.59F,734); map.put(.60F,813); map.put(.61F,905); map.put(.62F,1000); map.put(.63F,1055); map.put(.64F,1160); map.put(.65F,1355); map.put(.66F,1510); map.put(.67F,1805); map.put(.68F,2170); } /** * Returns the size of the word in {@link RandomTextDataGenerator}'s * dictionary that can generate text with the desired compression ratio. * * @throws RuntimeException If ratio is less than {@value #MIN_RATIO} or * greater than {@value #MAX_RATIO}. */ int getWordSizeForRatio(float ratio) { ratio = standardizeCompressionRatio(ratio); if (ratio >= MIN_RATIO && ratio <= MAX_RATIO) { return map.get(ratio); } else { throw new RuntimeException("Compression ratio should be in the range [" + MIN_RATIO + "," + MAX_RATIO + "]. Configured compression ratio is " + ratio + "."); } } } /** * Setup the data generator's configuration to generate compressible random * text data with the desired compression ratio. * Note that the compression ratio, if configured, will set the * {@link RandomTextDataGenerator}'s list-size and word-size based on * empirical values using the compression ratio set in the configuration. * * Hence to achieve the desired compression ratio, * {@link RandomTextDataGenerator}'s list-size will be set to the default * value i.e {@value RandomTextDataGenerator#DEFAULT_LIST_SIZE}. */ static void setupDataGeneratorConfig(Configuration conf) { boolean compress = isCompressionEmulationEnabled(conf); if (compress) { float ratio = getMapInputCompressionEmulationRatio(conf); LOG.info("GridMix is configured to generate compressed input data with " + " a compression ratio of " + ratio); int wordSize = COMPRESSION_LOOKUP_TABLE.getWordSizeForRatio(ratio); RandomTextDataGenerator.setRandomTextDataGeneratorWordSize(conf, wordSize); // since the compression ratios are computed using the default value of // list size RandomTextDataGenerator.setRandomTextDataGeneratorListSize(conf, RandomTextDataGenerator.DEFAULT_LIST_SIZE); } } /** * Returns a {@link RandomTextDataGenerator} that generates random * compressible text with the desired compression ratio. */ static RandomTextDataGenerator getRandomTextDataGenerator(float ratio, long seed) { int wordSize = COMPRESSION_LOOKUP_TABLE.getWordSizeForRatio(ratio); RandomTextDataGenerator rtg = new RandomTextDataGenerator(RandomTextDataGenerator.DEFAULT_LIST_SIZE, seed, wordSize); return rtg; } /** Publishes compression related data statistics. Following statistics are * published * <ul> * <li>Total compressed input data size</li> * <li>Number of compressed input data files</li> * <li>Compression Ratio</li> * <li>Text data dictionary size</li> * <li>Random text word size</li> * </ul> */ static DataStatistics publishCompressedDataStatistics(Path inputDir, Configuration conf, long uncompressedDataSize) throws IOException { FileSystem fs = inputDir.getFileSystem(conf); CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf); // iterate over compressed files and sum up the compressed file sizes long compressedDataSize = 0; int numCompressedFiles = 0; // obtain input data file statuses FileStatus[] outFileStatuses = fs.listStatus(inputDir, new Utils.OutputFileUtils.OutputFilesFilter()); for (FileStatus status : outFileStatuses) { // check if the input file is compressed if (compressionCodecs != null) { CompressionCodec codec = compressionCodecs.getCodec(status.getPath()); if (codec != null) { ++numCompressedFiles; compressedDataSize += status.getLen(); } } } LOG.info("Gridmix is configured to use compressed input data."); // publish the input data size LOG.info("Total size of compressed input data : " + StringUtils.humanReadableInt(compressedDataSize)); LOG.info("Total number of compressed input data files : " + numCompressedFiles); if (numCompressedFiles == 0) { throw new RuntimeException("No compressed file found in the input" + " directory : " + inputDir.toString() + ". To enable compression" + " emulation, run Gridmix either with " + " an input directory containing compressed input file(s) or" + " use the -generate option to (re)generate it. If compression" + " emulation is not desired, disable it by setting '" + COMPRESSION_EMULATION_ENABLE + "' to 'false'."); } // publish compression ratio only if its generated in this gridmix run if (uncompressedDataSize > 0) { // compute the compression ratio double ratio = ((double)compressedDataSize) / uncompressedDataSize; // publish the compression ratio LOG.info("Input Data Compression Ratio : " + ratio); } return new DataStatistics(compressedDataSize, numCompressedFiles, true); } /** * Enables/Disables compression emulation. * @param conf Target configuration where the parameter * {@value #COMPRESSION_EMULATION_ENABLE} will be set. * @param val The value to be set. */ static void setCompressionEmulationEnabled(Configuration conf, boolean val) { conf.setBoolean(COMPRESSION_EMULATION_ENABLE, val); } /** * Checks if compression emulation is enabled or not. Default is {@code true}. */ static boolean isCompressionEmulationEnabled(Configuration conf) { return conf.getBoolean(COMPRESSION_EMULATION_ENABLE, true); } /** * Enables/Disables input decompression emulation. * @param conf Target configuration where the parameter * {@value #INPUT_DECOMPRESSION_EMULATION_ENABLE} will be set. * @param val The value to be set. */ static void setInputCompressionEmulationEnabled(Configuration conf, boolean val) { conf.setBoolean(INPUT_DECOMPRESSION_EMULATION_ENABLE, val); } /** * Check if input decompression emulation is enabled or not. * Default is {@code false}. */ static boolean isInputCompressionEmulationEnabled(Configuration conf) { return conf.getBoolean(INPUT_DECOMPRESSION_EMULATION_ENABLE, false); } /** * Set the map input data compression ratio in the given conf. */ static void setMapInputCompressionEmulationRatio(Configuration conf, float ratio) { conf.setFloat(GRIDMIX_MAP_INPUT_COMPRESSION_RATIO, ratio); } /** * Get the map input data compression ratio using the given configuration. * If the compression ratio is not set in the configuration then use the * default value i.e {@value #DEFAULT_COMPRESSION_RATIO}. */ static float getMapInputCompressionEmulationRatio(Configuration conf) { return conf.getFloat(GRIDMIX_MAP_INPUT_COMPRESSION_RATIO, DEFAULT_COMPRESSION_RATIO); } /** * Set the map output data compression ratio in the given configuration. */ static void setMapOutputCompressionEmulationRatio(Configuration conf, float ratio) { conf.setFloat(GRIDMIX_MAP_OUTPUT_COMPRESSION_RATIO, ratio); } /** * Get the map output data compression ratio using the given configuration. * If the compression ratio is not set in the configuration then use the * default value i.e {@value #DEFAULT_COMPRESSION_RATIO}. */ static float getMapOutputCompressionEmulationRatio(Configuration conf) { return conf.getFloat(GRIDMIX_MAP_OUTPUT_COMPRESSION_RATIO, DEFAULT_COMPRESSION_RATIO); } /** * Set the reduce output data compression ratio in the given configuration. */ static void setReduceOutputCompressionEmulationRatio(Configuration conf, float ratio) { conf.setFloat(GRIDMIX_REDUCE_OUTPUT_COMPRESSION_RATIO, ratio); } /** * Get the reduce output data compression ratio using the given configuration. * If the compression ratio is not set in the configuration then use the * default value i.e {@value #DEFAULT_COMPRESSION_RATIO}. */ static float getReduceOutputCompressionEmulationRatio(Configuration conf) { return conf.getFloat(GRIDMIX_REDUCE_OUTPUT_COMPRESSION_RATIO, DEFAULT_COMPRESSION_RATIO); } /** * Standardize the compression ratio i.e round off the compression ratio to * only 2 significant digits. */ static float standardizeCompressionRatio(float ratio) { // round off to 2 significant digits int significant = (int)Math.round(ratio * 100); return ((float)significant)/100; } /** * Returns a {@link InputStream} for a file that might be compressed. */ static InputStream getPossiblyDecompressedInputStream(Path file, Configuration conf, long offset) throws IOException { FileSystem fs = file.getFileSystem(conf); if (isCompressionEmulationEnabled(conf) && isInputCompressionEmulationEnabled(conf)) { CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf); CompressionCodec codec = compressionCodecs.getCodec(file); if (codec != null) { Decompressor decompressor = CodecPool.getDecompressor(codec); if (decompressor != null) { CompressionInputStream in = codec.createInputStream(fs.open(file), decompressor); //TODO Seek doesnt work with compressed input stream. // Use SplittableCompressionCodec? return (InputStream)in; } } } FSDataInputStream in = fs.open(file); in.seek(offset); return (InputStream)in; } /** * Returns a {@link OutputStream} for a file that might need * compression. */ static OutputStream getPossiblyCompressedOutputStream(Path file, Configuration conf) throws IOException { FileSystem fs = file.getFileSystem(conf); JobConf jConf = new JobConf(conf); if (org.apache.hadoop.mapred.FileOutputFormat.getCompressOutput(jConf)) { // get the codec class Class<? extends CompressionCodec> codecClass = org.apache.hadoop.mapred.FileOutputFormat .getOutputCompressorClass(jConf, GzipCodec.class); // get the codec implementation CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf); // add the appropriate extension file = file.suffix(codec.getDefaultExtension()); if (isCompressionEmulationEnabled(conf)) { FSDataOutputStream fileOut = fs.create(file, false); return new DataOutputStream(codec.createOutputStream(fileOut)); } } return fs.create(file, false); } /** * Extracts compression/decompression related configuration parameters from * the source configuration to the target configuration. */ static void configureCompressionEmulation(Configuration source, Configuration target) { // enable output compression target.setBoolean("mapred.output.compress", source.getBoolean("mapred.output.compress", false)); // set the job output compression codec String jobOutputCompressionCodec = source.get("mapred.output.compression.codec"); if (jobOutputCompressionCodec != null) { target.set("mapred.output.compression.codec", jobOutputCompressionCodec); } // set the job output compression type String jobOutputCompressionType = source.get("mapred.output.compression.type"); if (jobOutputCompressionType != null) { target.set("mapred.output.compression.type", jobOutputCompressionType); } // enable map output compression target.setBoolean("mapred.compress.map.output", source.getBoolean("mapred.compress.map.output", false)); // set the map output compression codecs String mapOutputCompressionCodec = source.get("mapred.map.output.compression.codec"); if (mapOutputCompressionCodec != null) { target.set("mapred.map.output.compression.codec", mapOutputCompressionCodec); } // enable input decompression //TODO replace with mapInputBytes and hdfsBytesRead Path[] inputs = org.apache.hadoop.mapred.FileInputFormat .getInputPaths(new JobConf(source)); boolean needsCompressedInput = false; CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(source); for (Path input : inputs) { CompressionCodec codec = compressionCodecs.getCodec(input); if (codec != null) { needsCompressedInput = true; } } setInputCompressionEmulationEnabled(target, needsCompressedInput); } }