package org.archive.hadoop.mapreduce;
import java.io.IOException;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.archive.util.StringFieldExtractor;
import org.archive.util.StringFieldExtractor.StringTuple;
public class SimpleTextMapper extends Mapper<Object, Text, Text, Text>
implements Configurable {
private static String TEXT_OUTPUT_DELIM_CONFIG = "text.output.delim";
public static int MODE_GLOBAL = 0;
public static int MODE_FULL = 1;
private Configuration conf;
private Text key = new Text();
private Text remainder = new Text();
private String delim = " ";
private char delimC = ' ';
private int keyCols = 2;
StringBuilder sb = new StringBuilder();
StringFieldExtractor sfe = new StringFieldExtractor(delimC,keyCols);
public void map(Object y, Text value, Context context) throws IOException,
InterruptedException {
StringTuple st = sfe.split(value.toString());
key.set(st.first);
remainder.set(st.second == null ? "" : st.second);
context.write(key, remainder);
}
public Configuration getConf() {
return conf;
}
public void setConf(Configuration conf) {
this.conf = conf;
delim = conf.get(TEXT_OUTPUT_DELIM_CONFIG, delim);
if(delim != null) {
sfe.setDelim(delim.charAt(0));
}
}
}