/*
* Cloud9: A MapReduce Library for Hadoop
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package edu.umd.cloud9.webgraph.driver;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import tl.lin.data.array.ArrayListWritable;
import edu.umd.cloud9.webgraph.data.AnchorText;
/**
* <p>
* Main driver program for sorting the web graph. Command-line arguments are as follows:
* </p>
*
* <ul>
* <li>[input-path]: the input web graph (, (weighted) inverse web graph, etc.)</li>
* <li>[output-path]: the output path</li>
* <li>[number-of-documents]: an estimate of the number of pages in the graph</li>
* <li>[number-of-reducers]: number of reducers</li>
* </ul>
*
* @author Nima Asadi
*
*/
@SuppressWarnings("deprecation")
public class SortWebGraph extends Configured implements Tool {
private static final Logger LOG = Logger.getLogger(SortWebGraph.class);
private static final int DEFAULT_NUMBER_OF_DOCUMENTS = 503903810;
protected static class Partition implements
Partitioner<IntWritable, ArrayListWritable<AnchorText>> {
int totalDocuments;
public void configure(JobConf job) {
totalDocuments = job.getInt("Cloud9.NumberOfDocuments",
DEFAULT_NUMBER_OF_DOCUMENTS);
}
public int getPartition(IntWritable key,
ArrayListWritable<AnchorText> value, int numReduceTasks) {
int i = (key.get() / (totalDocuments / numReduceTasks));
if(i >= numReduceTasks) {
i = numReduceTasks - 1;
}
return i;
}
}
private static int printUsage() {
System.out.println("usage: [input-path] [output-path] " +
"[number-of-documents] [number-of-reducers]");
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
public int run(String[] args) throws Exception {
if(args.length != 4) {
printUsage();
return -1;
}
JobConf conf = new JobConf(getConf(), SortWebGraph.class);
FileSystem fs = FileSystem.get(conf);
String inputPath = args[0];
String outputPath = args[1];
int numberOfDocuments = Integer.parseInt(args[2]);
int numMappers = 1;
int numReducers = Integer.parseInt(args[3]);
conf.setJobName("SortWebGraph");
conf.set("mapred.child.java.opts", "-Xmx2048m");
conf.setInt("mapred.task.timeout", 60000000);
conf.set("mapreduce.map.memory.mb", "2048");
conf.set("mapreduce.map.java.opts", "-Xmx2048m");
conf.set("mapreduce.reduce.memory.mb", "2048");
conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
conf.set("mapreduce.task.timeout", "60000000");
if(numberOfDocuments == 0) {
numberOfDocuments = DEFAULT_NUMBER_OF_DOCUMENTS;
}
conf.setInt("Cloud9.NumberOfDocuments", numberOfDocuments);
conf.setNumMapTasks(numMappers);
conf.setNumReduceTasks(numReducers);
conf.setMapperClass(IdentityMapper.class);
conf.setPartitionerClass(Partition.class);
conf.setReducerClass(IdentityReducer.class);
conf.setOutputKeyClass(IntWritable.class);
conf.setOutputValueClass(ArrayListWritable.class);
conf.setMapOutputKeyClass(IntWritable.class);
conf.setMapOutputValueClass(ArrayListWritable.class);
conf.setInputFormat(SequenceFileInputFormat.class);
conf.setOutputFormat(SequenceFileOutputFormat.class);
SequenceFileOutputFormat.setCompressOutput(conf, true);
SequenceFileOutputFormat.setOutputCompressionType(conf,
SequenceFile.CompressionType.BLOCK);
SequenceFileInputFormat.setInputPaths(conf, inputPath);
FileOutputFormat.setOutputPath(conf, new Path(outputPath));
LOG.info("SortAnchorText");
LOG.info(" - input path: " + inputPath);
LOG.info(" - output path: " + outputPath);
LOG.info(" - number of documents: " +
conf.getInt("Cloud9.NumberOfDocuments", DEFAULT_NUMBER_OF_DOCUMENTS));
fs.delete(new Path(outputPath));
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new SortWebGraph(), args);
System.exit(res);
}
}