package org.archive.hadoop.mapreduce;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.archive.extract.ExtractingResourceFactoryMapper;
import org.archive.extract.ExtractingResourceProducer;
import org.archive.extract.ExtractorOutput;
import org.archive.extract.ProducerUtils;
import org.archive.extract.ResourceFactoryMapper;
import org.archive.extract.WATExtractorOutput;
import org.archive.hadoop.jobs.WATExtractorJob;
import org.archive.resource.Resource;
import org.archive.resource.ResourceProducer;
import org.archive.util.StringFieldExtractor;
import org.archive.util.StringFieldExtractor.StringTuple;
public class WATExtractorMapper extends
Mapper<Object, Text, Text, Text> {
public final static String WAT_EXTRACTOR_TARGET = "wat-extractor.target";
public final static String WAT_EXTRACTOR_OVERRIDE = "wat-extractor.override";
Path target = null;
FileSystem filesystem = null;
boolean overrideExistentFile = false;
StringFieldExtractor sfe = new StringFieldExtractor(' ', 1);
public static void setTargetDir(Configuration conf, String path) {
conf.set(WAT_EXTRACTOR_TARGET, path);
}
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
super.setup(context);
Configuration conf = context.getConfiguration();
String targetString = conf.get(WATExtractorJob.WAT_EXTRACT_TARGET);
overrideExistentFile = conf.getBoolean(WAT_EXTRACTOR_OVERRIDE, false);
target = new Path(targetString);
filesystem = target.getFileSystem(conf);
}
public void map(Object y, Text value, Context context)
throws IOException, InterruptedException {
//PArse the URL files
String valueS = value.toString();
String name;
String url = valueS;
int idx = valueS.indexOf(' ');
if(idx == -1) {
URL tmpUrl = new URL(valueS);
name = tmpUrl.getPath();
if(name.contains("/")) {
name = name.substring(name.lastIndexOf('/')+1);
}
} else {
StringTuple t = sfe.split(valueS);
if((t.first == null) || (t.second == null)) {
throw new IOException("Bad input line:" + valueS);
}
name = t.first;
url = t.second;
}
Path thisTarget = new Path(target,name);
Path thisTargetTmp = new Path(target,name+".wat.gz");
doExtract(url, thisTarget,thisTargetTmp);
}
private void doExtract(String url, Path target, Path targetTmp) throws IOException {
// Check if the target exists (from previous map)
long targetLen = getPathLength(target);
int max = Integer.MAX_VALUE;
if(targetLen > -1) {
// there's a file in the filesystem already,
if(overrideExistentFile){
if(!filesystem.delete(target, false)) {
throw new IOException("Failed to delete old copy");
}
} else {
return;
}
}
FSDataOutputStream fsdOut = filesystem.create(targetTmp, false);
ExtractorOutput out;
out = new WATExtractorOutput(fsdOut);
ResourceProducer producer = ProducerUtils.getProducer(url);
ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
ExtractingResourceProducer exProducer =
new ExtractingResourceProducer(producer, mapper);
Logger.getLogger("org.archive").setLevel(Level.WARNING);
int count = 0;
int incr = 1;
while(count < max) {
try {
Resource r = exProducer.getNext();
if(r == null) {
break;
}
count += incr;
out.output(r);
} catch(Exception e){
e.printStackTrace();
}
}
}
private long getPathLength(Path path) throws IOException {
FileStatus stat = null;
try {
stat = filesystem.getFileStatus(path);
// present.. check by size:
} catch (FileNotFoundException e) {
return -1;
}
return stat.getLen();
}
public static void setOverride(Configuration conf, boolean isOverride) {
conf.setBoolean(WAT_EXTRACTOR_OVERRIDE, isOverride);
}
}