package org.archive.hadoop.mapreduce; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.net.URL; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.methods.HeadMethod; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import org.archive.hadoop.jobs.HTTPImportJob; import org.archive.util.StringFieldExtractor; import org.archive.util.StringFieldExtractor.StringTuple; public class HTTPImportMapper extends Mapper<LongWritable, Text, Text, Text> { public final int BUFSIZ = 4096; Path target = null; FileSystem filesystem = null; Text doneText = null; HttpClient client = null; boolean softFails = false; StringFieldExtractor sfe = new StringFieldExtractor(' ', 1); public final static String HTTP_IMPORT_TARGET = "http-import.target"; public final static String HTTP_IMPORT_SOFT_FAIL = "http-import.soft-fail"; public HTTPImportMapper() { } public void init2() { System.err.println("Init map..."); } public static void setTargetDir(Configuration conf, String path) { conf.set(HTTP_IMPORT_TARGET, path); } public static void setSoftFails(Configuration conf, boolean isSoftFail) { conf.setBoolean(HTTP_IMPORT_SOFT_FAIL, isSoftFail); } @Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); Configuration conf = context.getConfiguration(); String targetString = conf.get(HTTPImportJob.HTTP_IMPORT_TARGET); if(targetString == null) { throw new IOException("No " + HTTPImportJob.HTTP_IMPORT_TARGET + " specified"); } softFails = conf.getBoolean(HTTP_IMPORT_SOFT_FAIL, false); target = new Path(targetString); filesystem = target.getFileSystem(conf); doneText = new Text("Done"); client = new HttpClient(); } @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String valueS = value.toString(); String name; String url = valueS; int idx = valueS.indexOf(' '); if(idx == -1) { URL tmpUrl = new URL(valueS); name = tmpUrl.getPath(); if(name.contains("/")) { name = name.substring(name.lastIndexOf('/')+1); } } else { StringTuple t = sfe.split(valueS); if((t.first == null) || (t.second == null)) { throw new IOException("Bad input line:" + valueS); } name = t.first; url = t.second; } Path thisTarget = new Path(target,name); Path thisTargetTmp = new Path(target,name+".TMP"); doCopy(url, thisTarget,thisTargetTmp); context.write(value, doneText); } private long getURLLengthByHead(String url) throws IOException { HeadMethod head = new HeadMethod(url); long urlLen = -1; // execute the method and handle any error responses. try { int code = client.executeMethod(head); if(code != 200) { if(softFails) { System.err.format("FAILED - non-200 for " + url); } else { throw new IOException("Non-200 for HEAD:" + url); } } urlLen = head.getResponseContentLength(); // discard: hope it's really empty (HEAD) and thus small... head.getResponseBody(); } finally { head.releaseConnection(); } return urlLen; } private long getPathLength(Path path) throws IOException { FileStatus stat = null; try { stat = filesystem.getFileStatus(path); // present.. check by size: } catch (FileNotFoundException e) { return -1; } return stat.getLen(); } private void doCopy(String url, Path target, Path targetTmp) throws IOException { // Check if the target exists (from previous map) long targetLen = getPathLength(target); long urlLen = -1; if(targetLen > -1) { // there's a file in the filesystem already, see if it's the // same length: urlLen = getURLLengthByHead(url); if(urlLen == targetLen) { // same size, assume it's done: return; } // diff length, do copy again, first remove old: if(!filesystem.delete(target, false)) { throw new IOException("Failed to delete old copy"); } } if(filesystem.isFile(targetTmp)) { filesystem.delete(targetTmp, false); } // do the copy: FSDataOutputStream out = filesystem.create(targetTmp, false); GetMethod get = new GetMethod(url); long copied = 0; try { int code = client.executeMethod(get); if(code != 200) { out.close(); filesystem.delete(targetTmp, false); if(softFails) { System.err.format("Warning - non 200 for " + url +":SKIP"); return; } else { throw new IOException("Non 200 on GET: " + url); } } urlLen = get.getResponseContentLength(); InputStream in = get.getResponseBodyAsStream(); byte buffer[] = new byte[BUFSIZ]; for(int cbread; (cbread = in.read(buffer)) >= 0; ) { out.write(buffer, 0, cbread); copied += cbread; } } finally { get.releaseConnection(); out.close(); } if(copied != urlLen) { // ack.. what went wrong? throw new IOException("Wrong copy length want(" + urlLen + ") got(" + copied + ") URL:" + url); } if(!filesystem.rename(targetTmp, target)) { throw new IOException(String.format("FAIL RENAME(%s)->(%s) URL(%s)", targetTmp.toString(), target.toString(),url)); } } }