/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.hadoop;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.zip.GZIPInputStream;
import org.apache.commons.httpclient.URIException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.archive.wayback.util.ByteOp;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
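/**
 * Hadoop Tool that drives a map/reduce job over CDX data: input lines are
 * passed through one of the mapper classes below (optionally canonicalizing
 * URLs, or dereferencing inputs that are themselves lists of CDX file
 * locations) and an identity reduce, with optional gzip-compressed output.
 */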
public class CDXSort extends Configured implements Tool {
private RunningJob jobResult = null;
static int printUsage() {
System.out.println("cdxsort <split> <input> <output>");
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
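	// Illustrative invocation via ToolRunner (the jar name and paths below
	// are hypothetical, not part of this code):
	//
	//   hadoop jar wayback-hadoop.jar org.archive.wayback.hadoop.CDXSort \
	//       --dereference-inputs --compress-output \
	//       /local/split.txt /hdfs/input/cdx /hdfs/output/cdx-sorted
	//
	// The three trailing arguments are the split file, input path, and
	// output path handled by run() below.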
	/**
	 * The main driver for the sort program. Invoke this method to submit the
	 * map/reduce job.
	 *
	 * @throws IOException
	 *             When there are communication problems with the job tracker.
	 */
public int run(String[] args) throws Exception {
boolean compressOutput = false;
boolean dereferenceInputs = false;
boolean canonicalize = false;
boolean funkyInput = false;
JobConf jobConf = new JobConf(getConf(), CDXSort.class);
jobConf.setJobName("cdxsort");
jobConf.setMapperClass(IdentityMapper.class);
jobConf.setReducerClass(IdentityReducer.class);
JobClient client = new JobClient(jobConf);
ClusterStatus cluster = client.getClusterStatus();
List<String> otherArgs = new ArrayList<String>();
for (int i = 0; i < args.length; ++i) {
try {
if ("-m".equals(args[i])) {
jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
} else if ("--compress-output".equals(args[i])) {
compressOutput = true;
} else if ("--funky-input".equals(args[i])) {
funkyInput = true;
} else if ("--dereference-inputs".equals(args[i])) {
dereferenceInputs = true;
} else if ("--canonicalize".equals(args[i])) {
canonicalize = true;
} else {
otherArgs.add(args[i]);
}
} catch (NumberFormatException except) {
System.out.println("ERROR: Integer expected instead of "
+ args[i]);
return printUsage();
} catch (ArrayIndexOutOfBoundsException except) {
System.out.println("ERROR: Required parameter missing from "
+ args[i - 1]);
				return printUsage(); // prints usage and returns -1
}
}
// Make sure there are exactly 3 parameters left: split input output
if (otherArgs.size() != 3) {
System.out.println("ERROR: Wrong number of parameters: "
+ otherArgs.size() + " instead of 3.");
return printUsage();
}
String splitPath = otherArgs.get(0);
String inputPath = otherArgs.get(1);
String outputPath = otherArgs.get(2);
// load the split file, find and set the number of reduces
AlphaPartitioner partitioner = new AlphaPartitioner();
File localSplitFile = new File(splitPath);
FileInputStream fis = new FileInputStream(localSplitFile);
		InputStreamReader isr = new InputStreamReader(fis, ByteOp.UTF8);
		BufferedReader bis = new BufferedReader(isr);
		// Boundary loading is currently disabled (see the commented-out code
		// below), so close the reader here rather than leak the file handle.
		bis.close();
// try {
// partitioner.loadBoundaries(bis);
// } catch (IOException except) {
// System.err.println("ERROR: Problem loading file " + splitPath);
// return printUsage(); // exits
// }
// jobConf.setNumReduceTasks(partitioner.getNumPartitions());
//
// // copy the split file into the FS, add to the DistributedCache:
//// AlphaPartitioner.setPartitionFile(jobConf, localSplitFile);
// AlphaPartitioner.setSplitCache(jobConf, localSplitFile);
// System.err.println("uploaded split file to FS and DistributedCache");
//
// // Set job configs:
// jobConf.setInputFormat(TextInputFormat.class);
//
// jobConf.setOutputFormat(TextOutputFormat.class);
// if (canonicalize) {
// jobConf.setMapperClass(CDXCanonicalizerMapClass.class);
// } else {
// jobConf.setMapperClass(CDXMapClass.class);
// }
// jobConf.setOutputKeyClass(Text.class);
// jobConf.setOutputValueClass(Text.class);
// jobConf.set("mapred.textoutputformat.separator", " ");
// jobConf.setPartitionerClass(AlphaPartitioner.class);
int inputCount = 0;
// Set job input:
if (dereferenceInputs) {
// SO SLOW... can't add one at a time...
// FileReader is2 = new FileReader(new File(inputPath));
// BufferedReader bis2 = new BufferedReader(is2);
// while (true) {
// String line = bis2.readLine();
// if (line == null) {
// break;
// }
// FileInputFormat.addInputPath(jobConf, new Path(line));
// inputCount++;
// System.err.println("Added path(" + inputCount + "): " + line);
// }
// PASS 2:
// FileReader is2 = new FileReader(new File(inputPath));
// BufferedReader bis2 = new BufferedReader(is2);
// ArrayList<String> list = new ArrayList<String>();
//
// while (true) {
// String line = bis2.readLine();
// if (line == null) {
// break;
// }
// list.add(line);
// inputCount++;
// }
// Path arr[] = new Path[list.size()];
// for(int i=0; i < list.size(); i++) {
// arr[i] = new Path(list.get(i));
// }
// FileInputFormat.setInputPaths(jobConf, arr);
// PASS 3:
if(funkyInput) {
jobConf.setMapperClass(FunkyDeReffingCDXCanonicalizerMapClass.class);
} else {
jobConf.setMapperClass(DeReffingCDXCanonicalizerMapClass.class);
}
FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
inputCount = 1;
} else {
FileInputFormat.setInputPaths(jobConf, new Path(inputPath));
inputCount = 1;
}
// Set job output:
FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));
if (compressOutput) {
FileOutputFormat.setCompressOutput(jobConf, true);
FileOutputFormat.setOutputCompressorClass(jobConf, GzipCodec.class);
}
// System.out.println("Running on " + cluster.getTaskTrackers()
// + " nodes, processing " + inputCount + " files/directories"
// + " into " + outputPath + " with "
// + partitioner.getNumPartitions() + " reduces.");
		Date startTime = new Date();
		System.out.println("Job started: " + startTime);
		jobResult = JobClient.runJob(jobConf);
		Date endTime = new Date();
		System.out.println("Job ended: " + endTime);
		System.out.println("The job took "
				+ (endTime.getTime() - startTime.getTime()) / 1000
				+ " seconds.");
return 0;
}
	/**
	 * Mapper which reads a canonicalized CDX line, splitting it into:
	 *   key   - URL + timestamp
	 *   value - everything else
	 *
	 * @author brad
	 * @version $Date$, $Revision$
	 */
public static class CDXMapClass extends MapReduceBase implements
Mapper<LongWritable, Text, Text, Text> {
private Text outKey = new Text();
private Text outValue = new Text();
public void map(LongWritable lineNumber, Text line,
OutputCollector<Text, Text> output, Reporter reporter)
throws IOException {
String tmp = line.toString();
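			// Split at the last space: everything before it becomes the key,
			// the final field becomes the value. For a (hypothetical) line
			// "A B C D", the key would be "A B C" and the value "D".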
int i1 = tmp.lastIndexOf(' ');
if(i1 > 0) {
outKey.set(tmp.substring(0,i1));
outValue.set(tmp.substring(i1+1));
output.collect(outKey, outValue);
} else {
System.err.println("Problem with line(" + tmp + ")");
}
// output.collect(line, outValue);
// reporter.setStatus("Running");
}
}
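	/**
	 * Variant of DeReffingCDXCanonicalizerMapClass which delegates each
	 * dereferenced line to FunkyCDXCanonicalizerMapClass instead of
	 * CDXCanonicalizerMapClass.
	 */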
public static class FunkyDeReffingCDXCanonicalizerMapClass extends DeReffingCDXCanonicalizerMapClass {
protected Mapper<LongWritable, Text, Text, Text> getInner() {
return new FunkyCDXCanonicalizerMapClass();
}
}
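	/**
	 * Mapper whose input lines are locations of CDX files - either http://
	 * URLs or Hadoop Path strings - rather than CDX records. Each location
	 * is opened (gunzipped if it ends in .gz) and every line it contains is
	 * fed to the inner mapper returned by getInner().
	 */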
public static class DeReffingCDXCanonicalizerMapClass extends MapReduceBase
implements Mapper<LongWritable, Text, Text, Text> {
protected Mapper<LongWritable, Text, Text, Text> getInner() {
return new CDXCanonicalizerMapClass();
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter)
*/
public void map(LongWritable lineNo, Text urlText,
OutputCollector<Text, Text> output, Reporter reporter)
throws IOException {
LongWritable lw = new LongWritable();
Text tmp = new Text();
// CDXCanonicalizerMapClass inner = new CDXCanonicalizerMapClass();
Mapper<LongWritable, Text, Text, Text> inner = getInner();
// arg 1 is a URL
String urlString = urlText.toString();
InputStream is = null;
FileSystem fs = null;
if(urlString.startsWith("http://")) {
URL u = new URL(urlString.toString());
System.err.println("Openning URL stream for:" + urlString);
is = u.openStream();
} else {
System.err.println("Creating default Filesystem for:" + urlString);
fs = FileSystem.get(new Configuration(true));
Path p = new Path(urlString);
// FSDataInputStream fsdis = fs.open(p);
is = fs.open(p);
}
if(urlString.endsWith(".gz")) {
is = new GZIPInputStream(is);
}
			try {
				BufferedReader br = new BufferedReader(
						new InputStreamReader(is, ByteOp.UTF8));
				String tmpS = null;
				long line = 0;
				while ((tmpS = br.readLine()) != null) {
					lw.set(line++);
					tmp.set(tmpS);
					inner.map(lw, tmp, output, reporter);
				}
			} catch (IOException e) {
				System.err.println("IOException with url: " + urlString);
				e.printStackTrace();
				throw e;
			} finally {
				// Close the stream (and the FileSystem, if one was opened)
				// even if reading or the inner map throws.
				is.close();
				if (fs != null) {
					fs.close();
				}
			}
}
}
	/**
	 * Mapper which reads an identity CDX line, outputting:
	 *   key   - canonicalized original URL + timestamp
	 *   value - everything else
	 *
	 * @author brad
	 * @version $Date$, $Revision$
	 */
public static class CDXCanonicalizerMapClass extends MapReduceBase
implements Mapper<LongWritable, Text, Text, Text> {
private Text outKey = new Text();
private Text outValue = new Text();
AggressiveUrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer();
private StringBuilder ksb = new StringBuilder();
private int i1 = 0;
private int i2 = 0;
private int i3 = 0;
private int i4 = 0;
public void map(LongWritable lineNumber, Text line,
OutputCollector<Text, Text> output, Reporter reporter)
throws IOException {
String s = line.toString();
boolean problems = true;
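			// i1..i3 locate the first three spaces in the line, i4 the last.
			// The third field (between i2 and i3) is the original URL; it is
			// canonicalized and prepended to everything between the first
			// and last space to form the key, while the final field becomes
			// the value.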
i1 = s.indexOf(' ');
if(i1 > 0) {
i2 = s.indexOf(' ', i1 + 1);
if(i2 > 0) {
i3 = s.indexOf(' ', i2 + 1);
if(i3 > 0) {
i4 = s.lastIndexOf(' ');
if(i4 > i3) {
try {
ksb.setLength(0);
ksb.append(canonicalizer.urlStringToKey(s.substring(i2 + 1, i3)));
ksb.append(s.substring(i1,i4));
outKey.set(ksb.toString());
outValue.set(s.substring(i4+1));
output.collect(outKey, outValue);
problems = false;
} catch(URIException e) {
// just eat it.. problems will be true.
}
}
}
}
}
if(problems) {
System.err.println("CDX-Can: Problem with line("+s+")");
}
}
}
	/**
	 * Mapper which reads an identity "funky" format CDX line, outputting:
	 *   key   - canonicalized original URL + timestamp
	 *   value - everything else
	 *
	 * Input lines are a hybrid format with the columns:
	 *
	 *   ORIG_URL
	 *   DATE
	 *   '-' (literal)
	 *   MIME
	 *   HTTP_CODE
	 *   SHA1
	 *   REDIRECT
	 *   START_OFFSET
	 *   ARC_PREFIX (sans .arc.gz)
	 *   ROBOT_FLAG (combination of 'A', 'I', 'F' - meaning no-Archive,
	 *               no-Index, no-Follow - or '-' if none)
	 *
	 * Ex:
	 *   http://www.myow.de:80/news_show.php? 20061126032815 - text/html 200 DVKFPTOJGCLT3G5GUVLCETHLFO3222JM - 91098929 foo A
	 *
	 * Need to:
	 *   . replace col 3 with the original URL
	 *   . replace col 1 with the canonicalized original URL
	 *   . truncate SHA1 to its first SHA1_DIGITS characters
	 *   . append .arc.gz to ARC_PREFIX
	 *   . omit lines whose ROBOT_FLAG contains 'A'
	 *   . remove the last column
	 *
	 * @author brad
	 * @version $Date$, $Revision$
	 */
public static class FunkyCDXCanonicalizerMapClass extends MapReduceBase
implements Mapper<LongWritable, Text, Text, Text> {
		private static final int SHA1_DIGITS = 3;
private Text outKey = new Text();
private Text outValue = new Text();
AggressiveUrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer();
private StringBuilder ksb = new StringBuilder();
private StringBuilder vsb = new StringBuilder();
private int i1 = 0;
private int i2 = 0;
private int i3 = 0;
private int i4 = 0;
public void map(LongWritable lineNumber, Text line,
OutputCollector<Text, Text> output, Reporter reporter)
throws IOException {
String s = line.toString();
String parts[] = s.split(" ");
boolean problems = true;
if(parts.length == 10) {
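				// Skip records whose robot flags include 'A' (no-archive).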
if(!parts[9].contains("A")) {
ksb.setLength(0);
vsb.setLength(0);
try {
ksb.append(canonicalizer.urlStringToKey(parts[0])).append(" ");
ksb.append(parts[1]); // date
vsb.append(parts[0]).append(" "); // orig_url
vsb.append(parts[3]).append(" "); // MIME
vsb.append(parts[4]).append(" "); // HTTP_CODE
vsb.append(parts[5].substring(0, SHA1_DIGITS)).append(" "); // SHA1
vsb.append(parts[6]).append(" "); // redirect
vsb.append(parts[7]).append(" "); // start_offset
vsb.append(parts[8]).append(".arc.gz"); // arc_prefix
outKey.set(ksb.toString());
outValue.set(vsb.toString());
output.collect(outKey, outValue);
} catch (URIException e) {
System.err.println("Failed Canonicalize:("+parts[0]+
") in ("+parts[8]+"):("+parts[7]+")");
}
}
} else {
System.err.println("Funky: Problem with line("+s+")");
}
}
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new CDXSort(), args);
System.exit(res);
}
/**
* Get the last job that was run using this instance.
*
* @return the results of the last job that was run
*/
public RunningJob getResult() {
return jobResult;
}
}