package com.yahoo.glimmer.util;
/*
* Copyright (c) 2012 Yahoo! Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software distributed under the License is
* distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
* See accompanying LICENSE file.
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
public class MergeSortTool extends Configured implements Tool {
private static final Log LOG = LogFactory.getLog(MergeSortTool.class);
private static final String OUTPUT_ARG = "output";
private static final String INPUT_ARG = "input";
private static final String COUNT_ARG = "count";
public static void main(String[] args) throws Exception {
int ret = ToolRunner.run(new MergeSortTool(), args);
System.exit(ret);
}
@Override
public int run(String[] args) throws Exception {
SimpleJSAP jsap = new SimpleJSAP(MergeSortTool.class.getName(), "Merges alpha numerically sorted text files on HDFS", new Parameter[] {
new FlaggedOption(INPUT_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'i', INPUT_ARG, "input filenames glob eg. .../part-r-?????/sortedlines.text"),
new FlaggedOption(OUTPUT_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o', OUTPUT_ARG, "output filename"),
new FlaggedOption(COUNT_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'c', COUNT_ARG, "optionally create a file containing a count of the number of lines merged in text"),
});
JSAPResult jsapResult = jsap.parse(args);
if (!jsapResult.success()) {
System.err.print(jsap.getUsage());
System.exit(1);
}
// FileSystem fs = FileSystem.get(getConf());
// CompressionCodecFactory factory = new
// CompressionCodecFactory(getConf());
// mergeSort(fs, sourcePaths, outputPath, factory);
// Maybe quicker to use a MR job with one reducer.. Currently
// decompression, merge and compression are all done in this thread..
Path inputGlobPath = new Path(jsapResult.getString(INPUT_ARG));
Configuration config = getConf();
FileSystem fs = FileSystem.get(config);
FileStatus[] sources = fs.globStatus(inputGlobPath);
if (sources.length == 0) {
System.err.println("No files matching input glob:" + inputGlobPath.toString());
return 1;
}
List<Path> sourcePaths = new ArrayList<Path>(sources.length);
for (FileStatus source : sources) {
if (source.isDirectory()) {
System.err.println(source.getPath().toString() + " is a directory.");
return 1;
}
sourcePaths.add(source.getPath());
}
Path outputPath = new Path(jsapResult.getString(OUTPUT_ARG));
CompressionCodecFactory factory = new CompressionCodecFactory(config);
FSDataOutputStream countsOutputStream = null;
if (jsapResult.contains(COUNT_ARG)) {
Path countsPath = null;
countsPath = new Path(jsapResult.getString(COUNT_ARG));
countsOutputStream = fs.create(countsPath);
}
int lineCount = MergeSortTool.mergeSort(fs, sourcePaths, outputPath, factory);
System.out.println("Merged " + lineCount + " lines into " + outputPath.toString());
if (countsOutputStream != null) {
countsOutputStream.writeBytes("" + lineCount + '\n');
}
countsOutputStream.flush();
countsOutputStream.close();
return 0;
}
public static int mergeSort(FileSystem fs, List<Path> sourcePaths, Path outputPath, CompressionCodecFactory compressionCodecFactory) throws IOException {
assert sourcePaths.size() > 0 : "No source paths given.";
LOG.info("Sorted merge into " + outputPath.toString());
OutputStream outputStream = fs.create(outputPath);
CompressionCodec inputCompressionCodec = compressionCodecFactory.getCodec(sourcePaths.get(0));
if (inputCompressionCodec != null) {
LOG.info("Input compression codec " + inputCompressionCodec.getClass().getName());
}
CompressionCodec outputCompressionCodec = compressionCodecFactory.getCodec(outputPath);
if (outputCompressionCodec != null) {
LOG.info("Output compression codec " + outputCompressionCodec.getClass().getName());
outputStream = outputCompressionCodec.createOutputStream(outputStream);
}
List<BufferedReader> readers = new ArrayList<BufferedReader>();
OutputStreamWriter writer = new OutputStreamWriter(outputStream);
for (Path partPath : sourcePaths) {
LOG.info("\tAdding source " + partPath.toString());
InputStream inputStream = fs.open(partPath);
if (inputCompressionCodec != null) {
inputStream = inputCompressionCodec.createInputStream(inputStream);
}
BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream));
readers.add(reader);
}
int count = ReadersWriterMergeSort.mergeSort(readers, writer);
writer.close();
for (BufferedReader reader : readers) {
reader.close();
}
readers.clear();
LOG.info("Processed " + count + " lines into " + outputPath.toString());
return count;
}
}