package org.archive.wayback.hadoop;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.HeadMethod;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
/**
 * Mapper that copies remote HTTP resources into a directory of a Hadoop
 * {@link FileSystem}.
 *
 * <p>Each input value is a line in one of two forms:
 * <ul>
 * <li>{@code URL} - the target file name is the last path segment of the
 * URL;</li>
 * <li>{@code NAME URL} - everything before the first space is the target
 * file name, everything after it is the URL.</li>
 * </ul>
 * The target directory is read from the job configuration under the key
 * {@code HTTPImportJob.HTTP_IMPORT_TARGET}. For every line handled without
 * error, the original input value is emitted with the value {@code "Done"}.
 */
public class HTTPImportMapper extends Mapper<LongWritable, Text, Text, Text> {
	/** Size, in bytes, of the buffer used while streaming a response body. */
	public static final int BUFSIZ = 4096;

	/** Directory the imported files are written into; set in {@link #setup}. */
	Path target = null;
	/** Filesystem that owns {@link #target}; set in {@link #setup}. */
	FileSystem filesystem = null;
	/** Value emitted for every successfully processed input line. */
	Text doneText = null;
	/** HTTP client used for all HEAD and GET requests of this mapper. */
	HttpClient client = null;

	/** Creates an unconfigured mapper; real initialization is in {@link #setup}. */
	public HTTPImportMapper() {
	}

	/** Writes a diagnostic line to stderr. Not called from within this class. */
	public void init2() {
		System.err.println("Init map...");
	}

	/**
	 * Resolves the import target directory and its filesystem from the job
	 * configuration and creates the HTTP client.
	 *
	 * @param context the task context supplying the job configuration
	 * @throws IOException if no target directory is configured or the
	 *         filesystem cannot be obtained
	 * @throws InterruptedException propagated from the superclass
	 */
	@Override
	protected void setup(Context context) throws IOException,
			InterruptedException {
		super.setup(context);
		Configuration conf = context.getConfiguration();
		String targetString = conf.get(HTTPImportJob.HTTP_IMPORT_TARGET);
		if(targetString == null) {
			throw new IOException("No " + HTTPImportJob.HTTP_IMPORT_TARGET
					+ " specified");
		}
		target = new Path(targetString);
		filesystem = target.getFileSystem(conf);
		doneText = new Text("Done");
		client = new HttpClient();
	}

	/**
	 * Parses one input line, copies the referenced URL into the target
	 * directory and emits {@code (value, "Done")} on success.
	 *
	 * @param key the input key (unused)
	 * @param value the input line, either {@code URL} or {@code NAME URL}
	 * @param context the task context the result is written to
	 * @throws IOException if the line cannot be parsed, no file name can be
	 *         derived from it, or the copy fails
	 * @throws InterruptedException propagated from {@code context.write}
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String valueS = value.toString();
		String name;
		String url = valueS;
		int idx = valueS.indexOf(' ');
		if(idx == -1) {
			// bare URL: use the last path segment as the file name
			URL tmpUrl = new URL(valueS);
			name = tmpUrl.getPath();
			if(name.contains("/")) {
				name = name.substring(name.lastIndexOf('/')+1);
			}
		} else {
			name = valueS.substring(0,idx);
			url = valueS.substring(idx+1);
		}
		if(name.length() == 0) {
			// e.g. a URL whose path ends in '/', or a line starting with a
			// space; fail with the offending input rather than letting the
			// Path constructor reject the empty string with an opaque error
			throw new IOException("Unable to determine target file name"
					+ " from input: " + valueS);
		}
		Path thisTarget = new Path(target,name);
		doCopy(url, thisTarget);
		context.write(value, doneText);
	}

	/**
	 * Issues a HEAD request and returns the content length the server
	 * reports for the URL.
	 *
	 * @param url the URL to probe
	 * @return the reported content length, as returned by
	 *         {@code HeadMethod.getResponseContentLength()}
	 * @throws IOException if the request fails or the status is not 200
	 */
	private long getURLLengthByHead(String url) throws IOException {
		HeadMethod head = new HeadMethod(url);
		long urlLen = -1;
		// execute the method and handle any error responses.
		try {
			int code = client.executeMethod(head);
			if(code != 200) {
				throw new IOException("Non-200 for HEAD:" + url);
			}
			urlLen = head.getResponseContentLength();
			// discard: hope it's really empty (HEAD) and thus small...
			head.getResponseBody();
		} finally {
			head.releaseConnection();
		}
		return urlLen;
	}

	/**
	 * Returns the length of an existing file in the target filesystem.
	 *
	 * @param path the file to inspect
	 * @return the file length in bytes, or -1 if the file does not exist
	 * @throws IOException if the filesystem lookup fails for another reason
	 */
	private long getPathLength(Path path) throws IOException {
		try {
			FileStatus stat = filesystem.getFileStatus(path);
			return stat.getLen();
		} catch (FileNotFoundException e) {
			// absent: signalled to the caller as -1
			return -1;
		}
	}

	/**
	 * Copies the body of {@code url} into {@code destination}.
	 *
	 * <p>If the destination already exists and its length equals the length
	 * reported by a HEAD request, the copy is skipped. If the lengths
	 * differ, the existing file is deleted and copied again.
	 *
	 * @param url the URL to fetch
	 * @param destination the file to create in the target filesystem
	 * @throws IOException if a request fails or returns a non-200 status,
	 *         the old copy cannot be deleted, the file cannot be written,
	 *         or the number of bytes copied differs from the length the
	 *         server reported
	 */
	private void doCopy(String url, Path destination) throws IOException {
		// Check if the target exists (from previous map)
		long targetLen = getPathLength(destination);
		long urlLen = -1;
		if(targetLen > -1) {
			// there's a file in the filesystem already, see if it's the
			// same length:
			urlLen = getURLLengthByHead(url);
			if(urlLen == targetLen) {
				// same size, assume it's done:
				return;
			}
			// diff length, do copy again, first remove old:
			if(!filesystem.delete(destination, false)) {
				throw new IOException("Failed to delete old copy");
			}
		}
		// do the copy:
		GetMethod get = new GetMethod(url);
		long copied = 0;
		try {
			int code = client.executeMethod(get);
			if(code != 200) {
				throw new IOException("Non 200 on GET: " + url);
			}
			urlLen = get.getResponseContentLength();
			InputStream in = get.getResponseBodyAsStream();
			// Create the output only once the server has answered 200, so a
			// failed request neither leaks an open stream nor leaves an
			// empty file behind.
			FSDataOutputStream out = filesystem.create(destination, false);
			try {
				// a null stream means the response carried no body
				if(in != null) {
					byte[] buffer = new byte[BUFSIZ];
					for(int cbread; (cbread = in.read(buffer)) >= 0; ) {
						out.write(buffer, 0, cbread);
						copied += cbread;
					}
				}
			} finally {
				out.close();
			}
		} finally {
			get.releaseConnection();
		}
		// A negative length means the server did not announce one (for
		// example with chunked transfer encoding); there is then nothing
		// to verify the copy against.
		if(urlLen >= 0 && copied != urlLen) {
			// ack.. what went wrong?
			throw new IOException("Wrong copy length want(" + urlLen
					+ ") got(" + copied + ") URL:" + url);
		}
	}
}