/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.hadoop;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.archive.wayback.util.ByteOp;
/**
 * Command-line driver that configures and submits the "cdx-sort" map/reduce
 * job. The job maps input with {@link CDXCanonicalizingMapper}, partitions
 * with {@link AlphaPartitioner} using a split file, and reduces with
 * {@link CDXReducer}.
 *
 * Usage: <code>cdxsort [OPTIONS] &lt;split&gt; &lt;input&gt; &lt;output&gt;</code>;
 * see {@link #printUsage()} for the option list.
 *
 * @author brad
 */
public class CDXSortDriver implements Tool {

    /** Configuration injected through {@link #setConf(Configuration)}. */
    Configuration conf = null;

    /**
     * As hard-coded into the Text RecordWriter
     */
    public static String TEXT_OUTPUT_DELIM_CONFIG =
        "mapred.textoutputformat.separator";

    /**
     * Counts the lines of the file at <code>path</code>.
     *
     * @param path file to read, resolved against its own FileSystem
     * @param conf configuration used to look up the FileSystem
     * @return number of lines returned by BufferedReader.readLine()
     * @throws IOException if the file cannot be opened or read
     */
    private static int countLinesInPath(Path path, Configuration conf)
            throws IOException {
        FileSystem fs = path.getFileSystem(conf);
        FSDataInputStream is = fs.open(path);
        BufferedReader br = null;
        int lineCount = 0;
        try {
            br = new BufferedReader(new InputStreamReader(is, ByteOp.UTF8));
            while (br.readLine() != null) {
                lineCount++;
            }
        } finally {
            // Always release the stream, even when reading fails part-way.
            // Closing the reader also closes the wrapped input stream.
            if (br != null) {
                br.close();
            } else {
                is.close();
            }
        }
        return lineCount;
    }

    /**
     * Prints the command-line usage summary to standard output.
     *
     * @return always -1, so callers can <code>return printUsage();</code>
     */
    static int printUsage() {
        System.out.println("cdxsort <split> <input> <output>");
        System.out.println("cdxsort [OPTIONS] <split> <input> <output>");
        System.out.println("\tOPTIONS can be:");
        System.out.println("\t\t-m NUM - try to run with approximately NUM map tasks");
        System.out.println("\t\t--compressed-input - assume input is compressed, even without .gz suffix");
        System.out.println("\t\t--gzip-range - assume input lines are PATH START LENGTH such that a");
        System.out.println("\t\t\t valid gzip record exists in PATH between START and START+LENGTH");
        System.out.println("\t\t\t that contains the records to process");
        System.out.println("\t\t--compress-output - compress output files with GZip");
        System.out.println("\t\t--delimiter DELIM - assume DELIM delimter for input and output, instead of default <SPACE>");
        System.out.println("\t\t--map-global - use the GLOBAL CDX map function, which implies:");
        System.out.println("\t\t\t. extra trailing field indicating HTML meta NOARCHIVE data, which should be omitted, result lines do not include the last field");
        System.out.println("\t\t\t. truncating digest field to 3 digits");
        System.out.println("\t\t\t. column 0 is original URL (identity CDX files)");
        System.out.println();
        // ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    /**
     * The main driver for sort program. Invoke this method to submit the
     * map/reduce job.
     *
     * Recognized options are <code>-m NUM</code>,
     * <code>--compress-output</code>, <code>--compressed-input</code>,
     * <code>--gzip-range</code>, <code>--delimiter DELIM</code>,
     * <code>--map-full</code> (the default map mode) and
     * <code>--map-global</code>. Exactly three other arguments must remain:
     * the split file, the input path and the output path.
     *
     * @param args command-line arguments
     * @return 0 if the job completed successfully, 1 if it failed, or -1 if
     *         the arguments were invalid (usage is printed in that case)
     * @throws Exception
     *             if the job cannot be configured or submitted, e.g. when
     *             there are communication problems with the job tracker.
     */
    public int run(String[] args) throws Exception {
        String delim = " ";
        long desiredMaps = 10;
        boolean compressOutput = false;
        boolean compressedInput = false;
        boolean gzipRange = false;
        List<String> otherArgs = new ArrayList<String>();
        int mapMode = CDXCanonicalizingMapper.MODE_FULL;

        for (int i = 0; i < args.length; ++i) {
            try {
                if ("-m".equals(args[i])) {
                    desiredMaps = Integer.parseInt(args[++i]);
                } else if ("--compress-output".equals(args[i])) {
                    compressOutput = true;
                } else if ("--compressed-input".equals(args[i])) {
                    compressedInput = true;
                } else if ("--gzip-range".equals(args[i])) {
                    gzipRange = true;
                } else if ("--delimiter".equals(args[i])) {
                    delim = args[++i];
                } else if ("--map-full".equals(args[i])) {
                    mapMode = CDXCanonicalizingMapper.MODE_FULL;
                } else if ("--map-global".equals(args[i])) {
                    mapMode = CDXCanonicalizingMapper.MODE_GLOBAL;
                } else {
                    otherArgs.add(args[i]);
                }
            } catch (NumberFormatException except) {
                // i was already advanced, so args[i] is the offending value.
                System.out.println("ERROR: Integer expected instead of "
                        + args[i]);
                return printUsage();
            } catch (ArrayIndexOutOfBoundsException except) {
                // i was advanced past the end, so args[i - 1] is the option
                // that is missing its value.
                System.out.println("ERROR: Required parameter missing from "
                        + args[i - 1]);
                return printUsage();
            }
        }

        // A zero map count would divide by zero below and a negative one
        // would produce a negative split size, so reject both up front.
        if (desiredMaps < 1) {
            System.out.println("ERROR: -m requires a positive integer, got "
                    + desiredMaps);
            return printUsage();
        }

        // Make sure there are exactly 3 parameters left: split input output
        if (otherArgs.size() != 3) {
            System.out.println("ERROR: Wrong number of parameters: "
                    + otherArgs.size() + " instead of 3.");
            return printUsage();
        }

        String splitPathString = otherArgs.get(0);
        String inputPathString = otherArgs.get(1);
        String outputPathString = otherArgs.get(2);

        Path splitPath = new Path(splitPathString);
        Path inputPath = new Path(inputPathString);
        Path outputPath = new Path(outputPathString);

        Job job = new Job(getConf(), "cdx-sort");
        // Settings are written to the job's own configuration; named
        // distinctly so it does not shadow the conf field.
        Configuration jobConf = job.getConfiguration();
        job.setJarByClass(CDXSortDriver.class);

        job.setMapperClass(CDXCanonicalizingMapper.class);
        job.setReducerClass(CDXReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // configure the "map mode"
        CDXCanonicalizingMapper.setMapMode(jobConf, mapMode);

        // set up the delimiter:
        jobConf.set(TEXT_OUTPUT_DELIM_CONFIG, delim);

        if (compressOutput) {
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        }

        // set up the Partitioner, including number of reduce tasks
        // (one reduce task per line of the split file):
        FileSystem fs = inputPath.getFileSystem(jobConf);
        int splitCount = countLinesInPath(splitPath, jobConf);
        System.err.println("Split/Reduce count:" + splitCount);
        job.setNumReduceTasks(splitCount);

        AlphaPartitioner.setPartitionPath(jobConf, splitPathString);
        job.setPartitionerClass(AlphaPartitioner.class);

        // calculate the byte size to get the correct number of map tasks.
        // The division is done in long arithmetic: narrowing the length to
        // int first would wrap for inputs larger than 2 GiB.
        FileStatus inputStatus = fs.getFileStatus(inputPath);
        long inputLen = inputStatus.getLen();
        long bytesPerMap = inputLen / desiredMaps;

        FileInputFormat.addInputPath(job, inputPath);
        FileInputFormat.setMaxInputSplitSize(job, bytesPerMap);
        if (gzipRange) {
            job.setInputFormatClass(GZIPRangeLineDereferencingInputFormat.class);
        } else {
            job.setInputFormatClass(LineDereferencingInputFormat.class);
            if (compressedInput) {
                LineDereferencingRecordReader.forceCompressed(jobConf);
            }
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        return (job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Runs the driver through ToolRunner and exits the JVM with the value
     * returned by {@link #run(String[])}.
     *
     * @param args command-line arguments
     * @throws Exception if ToolRunner or the job throws
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new CDXSortDriver(), args);
        System.exit(res);
    }

    /**
     * @return the configuration last passed to
     *         {@link #setConf(Configuration)}, or null if none was set
     */
    public Configuration getConf() {
        return conf;
    }

    /**
     * @param conf the configuration used as the base for the submitted job
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
    }
}