package edu.umd.cloud9.io;

import java.io.IOException;
import java.math.BigInteger;
import java.util.Random;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

import com.google.common.base.Preconditions;

public class FileMerger extends Configured implements Tool {
  private static final Logger sLogger = Logger.getLogger(FileMerger.class);

  public static final Random RANDOM_GENERATOR = new Random();
  public static final int DEFAULT_RANDOM_STRING_LENGTH = 20;

  public static final String PATH_INDICATOR = "path";
  public static final String INTEGER_INDICATOR = "int";

  public static final String HELP_OPTION = "help";
  public static final String INPUT_OPTION = "input";
  public static final String OUTPUT_OPTION = "output";
  public static final String MAPPER_OPTION = "mapper";
  public static final String REDUCER_OPTION = "reducer";

  public static final String MERGE = "merge-tmp-dir";

  public static final String LOCAL_MERGE_OPTION = "localmerge";
  public static final boolean LOCAL_MERGE = false;
  public static final String DELETE_SOURCE_OPTION = "deletesource";
  public static final boolean DELETE_SOURCE = false;
  public static final String TEXT_FILE_INPUT_FORMAT = "textformat";
  public static final boolean TEXT_FILE_INPUT = false;

  public static final String FILE_CONTENT_DELIMITER = "";

  /**
   * Generate a random string of given length.
   */
  public static String generateRandomString(int length) {
    return new BigInteger(length * 4, RANDOM_GENERATOR).toString(32);
  }

  /**
   * Generate a random string of default length.
   */
  public static String generateRandomString() {
    return generateRandomString(DEFAULT_RANDOM_STRING_LENGTH);
  }

  /**
   * Merges all files matched by the glob expression <code>inputFiles</code> into
   * <code>outputFile</code>.
   */
  public static Path mergeTextFiles(Configuration configuration, String inputFiles,
      String outputFile, int numberOfMappers, boolean deleteSource) throws IOException {
    // TODO: add in configuration
    if (numberOfMappers <= 0) {
      return mergeTextFiles(configuration, inputFiles, outputFile, deleteSource, false);
    } else {
      return mergeFilesDistribute(configuration, inputFiles, outputFile, numberOfMappers,
          LongWritable.class, Text.class, TextInputFormat.class, TextOutputFormat.class,
          deleteSource, false);
    }
  }

  public static Path mergeTextFiles(Configuration configuration, String inputFiles,
      String outputFile, int numberOfMappers, boolean deleteSource,
      boolean deleteDestinationFileIfExist) throws IOException {
    if (numberOfMappers <= 0) {
      return mergeTextFiles(configuration, inputFiles, outputFile, deleteSource,
          deleteDestinationFileIfExist);
    } else {
      return mergeFilesDistribute(configuration, inputFiles, outputFile, numberOfMappers,
          LongWritable.class, Text.class, TextInputFormat.class, TextOutputFormat.class,
          deleteSource, deleteDestinationFileIfExist);
    }
  }

  /**
   * @param inputFiles a glob expression of the files to be merged
   * @param outputFile a destination file path
   * @param deleteSource delete source files after merging
   * @throws IOException
   */
  private static Path mergeTextFiles(Configuration configuration, String inputFiles,
      String outputFile, boolean deleteSource, boolean deleteDestinationFileIfExist)
      throws IOException {
    JobConf conf = new JobConf(configuration, FileMerger.class);
    FileSystem fs = FileSystem.get(conf);

    Path inputPath = new Path(inputFiles);
    Path outputPath = new Path(outputFile);

    if (deleteDestinationFileIfExist) {
      if (fs.exists(outputPath)) {
        // carefully remove the destination file, not recursive
        fs.delete(outputPath, false);
        sLogger.info("Warning: removing destination file since it already exists...");
      }
    } else {
      Preconditions.checkArgument(!fs.exists(outputPath), new IOException(
          "Destination file already exists..."));
    }

    FileUtil.copyMerge(fs, inputPath, fs, outputPath, deleteSource, conf,
        FILE_CONTENT_DELIMITER);
    sLogger.info("Successfully merged " + inputPath.toString() + " to " + outputFile);

    return outputPath;
  }
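
  // Usage sketch (added comment; the paths below are illustrative, not from this file):
  // a non-positive mapper count takes the local FileUtil.copyMerge path above, while a
  // positive count runs the distributed identity-map merge job instead.
  //
  //   Configuration conf = new Configuration();
  //   FileMerger.mergeTextFiles(conf, "/tmp/job-output/part-*", "/tmp/merged.txt", 0, false);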

  public static Path mergeSequenceFiles(Configuration configuration, String inputFiles,
      String outputFile, int numberOfMappers, Class<? extends Writable> keyClass,
      Class<? extends Writable> valueClass, boolean deleteSource) throws IOException,
      InstantiationException, IllegalAccessException {
    if (numberOfMappers <= 0) {
      return mergeSequenceFiles(configuration, inputFiles, outputFile, keyClass, valueClass,
          deleteSource, false);
    } else {
      return mergeFilesDistribute(configuration, inputFiles, outputFile, numberOfMappers,
          keyClass, valueClass, SequenceFileInputFormat.class, SequenceFileOutputFormat.class,
          deleteSource, false);
    }
  }

  public static Path mergeSequenceFiles(Configuration configuration, String inputFiles,
      String outputFile, int numberOfMappers, Class<? extends Writable> keyClass,
      Class<? extends Writable> valueClass, boolean deleteSource,
      boolean deleteDestinationFileIfExist) throws IOException, InstantiationException,
      IllegalAccessException {
    if (numberOfMappers <= 0) {
      return mergeSequenceFiles(configuration, inputFiles, outputFile, keyClass, valueClass,
          deleteSource, deleteDestinationFileIfExist);
    } else {
      return mergeFilesDistribute(configuration, inputFiles, outputFile, numberOfMappers,
          keyClass, valueClass, SequenceFileInputFormat.class, SequenceFileOutputFormat.class,
          deleteSource, deleteDestinationFileIfExist);
    }
  }

  private static Path mergeSequenceFiles(Configuration configuration, String inputFiles,
      String outputFile, Class<? extends Writable> keyClass,
      Class<? extends Writable> valueClass, boolean deleteSource,
      boolean deleteDestinationFileIfExist) throws IOException, InstantiationException,
      IllegalAccessException {
    JobConf conf = new JobConf(configuration, FileMerger.class);
    FileSystem fs = FileSystem.get(conf);

    Path inputPath = new Path(inputFiles);
    Path outputPath = new Path(outputFile);

    if (deleteDestinationFileIfExist) {
      if (fs.exists(outputPath)) {
        // carefully remove the destination file, not recursive
        fs.delete(outputPath, false);
        sLogger.info("Warning: removing destination file since it already exists...");
      }
    } else {
      Preconditions.checkArgument(!fs.exists(outputPath), new IOException(
          "Destination file already exists..."));
    }

    FileStatus[] fileStatuses = fs.globStatus(inputPath);

    SequenceFile.Reader sequenceFileReader = null;
    SequenceFile.Writer sequenceFileWriter = null;

    Writable key, value;
    key = keyClass.newInstance();
    value = valueClass.newInstance();

    try {
      sequenceFileWriter = new SequenceFile.Writer(fs, conf, outputPath, keyClass, valueClass);

      for (FileStatus fileStatus : fileStatuses) {
        sLogger.info("Opening file " + fileStatus.getPath() + "...");
        sequenceFileReader = new SequenceFile.Reader(fs, fileStatus.getPath(), conf);

        while (sequenceFileReader.next(key, value)) {
          sequenceFileWriter.append(key, value);
        }

        if (deleteSource) {
          fs.deleteOnExit(fileStatus.getPath());
        }
      }
    } finally {
      IOUtils.closeStream(sequenceFileReader);
      IOUtils.closeStream(sequenceFileWriter);
    }

    sLogger.info("Successfully merged " + inputPath.toString() + " to " + outputFile);

    return outputPath;
  }
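
  // Usage sketch (added comment; key/value classes and paths are assumptions for
  // illustration): the Writable classes are required so that empty key/value instances can
  // be created for the read/append loop above.
  //
  //   Configuration conf = new Configuration();
  //   FileMerger.mergeSequenceFiles(conf, "/tmp/job-output/part-*", "/tmp/merged.seq", 0,
  //       LongWritable.class, Text.class, false);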

  private static Path mergeFilesDistribute(Configuration configuration, String inputFiles,
      String outputFile, int numberOfMappers, Class<? extends Writable> keyClass,
      Class<? extends Writable> valueClass, Class<? extends FileInputFormat> fileInputClass,
      Class<? extends FileOutputFormat> fileOutputClass, boolean deleteSource,
      boolean deleteDestinationFileIfExist) throws IOException {
    JobConf conf = new JobConf(configuration, FileMerger.class);
    conf.setJobName(FileMerger.class.getSimpleName());
    FileSystem fs = FileSystem.get(conf);

    sLogger.info("Tool: " + FileMerger.class.getSimpleName());
    sLogger.info(" - merge files from: " + inputFiles);
    sLogger.info(" - merge files to: " + outputFile);

    conf.setNumMapTasks(numberOfMappers);
    conf.setNumReduceTasks(1);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setMapOutputKeyClass(keyClass);
    conf.setMapOutputValueClass(valueClass);
    conf.setOutputKeyClass(keyClass);
    conf.setOutputValueClass(valueClass);

    conf.setInputFormat(fileInputClass);
    conf.setOutputFormat(fileOutputClass);

    Path inputPath = new Path(inputFiles);
    Path mergePath = new Path(inputPath.getParent().toString() + Path.SEPARATOR + MERGE
        + generateRandomString());
    Preconditions.checkArgument(!fs.exists(mergePath), new IOException(
        "Intermediate merge directory already exists..."));

    Path outputPath = new Path(outputFile);
    if (deleteDestinationFileIfExist) {
      if (fs.exists(outputPath)) {
        // carefully remove the destination file, not recursive
        fs.delete(outputPath, false);
        sLogger.info("Warning: removing destination file since it already exists...");
      }
    } else {
      Preconditions.checkArgument(!fs.exists(outputPath), new IOException(
          "Destination file already exists..."));
    }

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, mergePath);
    FileOutputFormat.setCompressOutput(conf, true);

    try {
      long startTime = System.currentTimeMillis();
      RunningJob job = JobClient.runJob(conf);
      sLogger.info("Merge finished in " + (System.currentTimeMillis() - startTime) / 1000.0
          + " seconds");

      fs.rename(new Path(mergePath.toString() + Path.SEPARATOR + "part-00000"), outputPath);
      if (deleteSource) {
        for (FileStatus fileStatus : fs.globStatus(inputPath)) {
          fs.deleteOnExit(fileStatus.getPath());
        }
      }
    } finally {
      fs.delete(mergePath, true);
    }

    sLogger.info("Successfully merged " + inputFiles + " to " + outputFile);

    return outputPath;
  }
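
  // Hedged example (added comment; the jar name and paths are assumptions) of the command
  // line that run() below is written to parse:
  //
  //   hadoop jar cloud9.jar edu.umd.cloud9.io.FileMerger \
  //     -input /tmp/job-output/part-* -output /tmp/merged -mapper 10 -deletesource
  //
  // Omitting -mapper (or passing a non-positive value) selects the local merge path; adding
  // -textformat treats the inputs as plain text rather than SequenceFiles.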

  /**
   * TODO: add in hadoop configuration
   */
  @Override
  public int run(String[] args) throws IOException {
    Options options = new Options();

    options.addOption(HELP_OPTION, false, "print the help message");
    options.addOption(OptionBuilder.withArgName(PATH_INDICATOR).hasArg()
        .withDescription("input file or directory").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName(PATH_INDICATOR).hasArg()
        .withDescription("output file").create(OUTPUT_OPTION));
    options.addOption(OptionBuilder
        .withArgName(INTEGER_INDICATOR)
        .hasArg()
        .withDescription(
            "number of mappers (default to 0 and hence local merge mode, set to positive value to enable cluster merge mode)")
        .create(MAPPER_OPTION));
    options.addOption(OptionBuilder.withArgName("property=value").hasArgs(2).withValueSeparator()
        .withDescription("assign value for given property").create("D"));
    options.addOption(TEXT_FILE_INPUT_FORMAT, false, "input file in text format");
    options.addOption(DELETE_SOURCE_OPTION, false, "delete sources after merging");

    int mapperTasks = 0;
    boolean deleteSource = DELETE_SOURCE;
    boolean textFileFormat = TEXT_FILE_INPUT;

    String inputPath = "";
    String outputPath = "";

    GenericOptionsParser genericOptionsParser = new GenericOptionsParser(args);
    Configuration configuration = genericOptionsParser.getConfiguration();

    CommandLineParser parser = new GnuParser();
    HelpFormatter formatter = new HelpFormatter();
    try {
      CommandLine line = parser.parse(options, args);

      if (line.hasOption(HELP_OPTION)) {
        formatter.printHelp(FileMerger.class.getName(), options);
        System.exit(0);
      }

      if (line.hasOption(INPUT_OPTION)) {
        inputPath = line.getOptionValue(INPUT_OPTION);
      } else {
        throw new ParseException("Parsing failed due to " + INPUT_OPTION + " not initialized...");
      }

      if (line.hasOption(OUTPUT_OPTION)) {
        outputPath = line.getOptionValue(OUTPUT_OPTION);
      } else {
        throw new ParseException("Parsing failed due to " + OUTPUT_OPTION + " not initialized...");
      }

      if (line.hasOption(MAPPER_OPTION)) {
        mapperTasks = Integer.parseInt(line.getOptionValue(MAPPER_OPTION));
        if (mapperTasks <= 0) {
          sLogger.info("Warning: " + MAPPER_OPTION + " is not positive, merge in local mode...");
          mapperTasks = 0;
        }
      }

      if (line.hasOption(DELETE_SOURCE_OPTION)) {
        deleteSource = true;
      }

      if (line.hasOption(TEXT_FILE_INPUT_FORMAT)) {
        textFileFormat = true;
      }
    } catch (ParseException pe) {
      System.err.println(pe.getMessage());
      formatter.printHelp(FileMerger.class.getName(), options);
      System.exit(0);
    } catch (NumberFormatException nfe) {
      System.err.println(nfe.getMessage());
      System.exit(0);
    }

    try {
      merge(configuration, inputPath, outputPath, mapperTasks, textFileFormat, deleteSource);
    } catch (InstantiationException ie) {
      ie.printStackTrace();
    } catch (IllegalAccessException iae) {
      iae.printStackTrace();
    }

    return 0;
  }

  @SuppressWarnings("unchecked")
  public static Path merge(Configuration configuration, String inputPath, String outputPath,
      int mapperTasks, boolean textFileFormat, boolean deleteSource) throws IOException,
      InstantiationException, IllegalAccessException {
    Class<? extends Writable> keyClass = LongWritable.class;
    Class<? extends Writable> valueClass = Text.class;

    FileSystem fs = FileSystem.get(new Configuration());
    if (!textFileFormat) {
      FileStatus[] fileStatus = fs.globStatus(new Path(inputPath));
      Preconditions.checkArgument(fileStatus.length > 0, "Invalid input path...");

      SequenceFile.Reader reader = new SequenceFile.Reader(fs,
          fileStatus[fileStatus.length - 1].getPath(), fs.getConf());
      try {
        keyClass = (Class<? extends Writable>) reader.getKeyClass();
        valueClass = (Class<? extends Writable>) reader.getValueClass();
        sLogger.info("Key type: " + keyClass.toString());
        sLogger.info("Value type: " + valueClass.toString());
      } catch (Exception e) {
        throw new RuntimeException("Error in loading key/value class");
      } finally {
        // close the reader even if the key/value classes cannot be loaded
        reader.close();
      }
    }

    if (textFileFormat) {
      return mergeTextFiles(configuration, inputPath, outputPath, mapperTasks, deleteSource);
    } else {
      return mergeSequenceFiles(configuration, inputPath, outputPath, mapperTasks, keyClass,
          valueClass, deleteSource);
    }
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new FileMerger(), args);
    System.exit(res);
  }
}
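
// Programmatic usage sketch (added comment; paths and mapper count are illustrative only):
//
//   Configuration conf = new Configuration();
//   int exitCode = ToolRunner.run(conf, new FileMerger(), new String[] {
//       "-input", "/tmp/job-output/part-*", "-output", "/tmp/merged", "-mapper", "4" });
//
// The static merge(...) helper can also be called directly; for SequenceFile input it reads
// the key/value classes from the last file matching the input glob.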