/*
* Source code for Listing 12.2
*
*/
package mia.clustering.ch12.twitter;
import java.io.IOException;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.common.Parameters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Mapper that splits each input line with a configurable regular expression
 * and emits one field as the output key and another field as the output value.
 *
 * <p>Configuration is read in {@link #setup(Context)} from the
 * {@code job.parameters} entry of the job configuration, parsed with
 * {@link Parameters#parseParams(String)}. Three parameters are required:
 * <ul>
 * <li>{@code splitPattern} - regular expression used to split each line</li>
 * <li>{@code selectedField} - zero-based index of the field emitted as value</li>
 * <li>{@code groupByField} - zero-based index of the field emitted as key</li>
 * </ul>
 *
 * <p>Lines that yield too few fields are not emitted; they are counted in the
 * {@code Map / LinesWithErrors} counter instead.
 */
public class ByKeyMapper extends Mapper<LongWritable,Text,Text,Text> {
  private static final Logger log = LoggerFactory
      .getLogger(ByKeyMapper.class);

  /** Configuration entry holding the serialized mapper parameters. */
  private static final String PARAMETERS_KEY = "job.parameters";
  private static final String SPLIT_PATTERN_PARAM = "splitPattern";
  private static final String SELECTED_FIELD_PARAM = "selectedField";
  private static final String GROUP_BY_FIELD_PARAM = "groupByField";

  private Pattern splitter;
  private int selectedField; // text of tweet
  private int groupByField; // username

  // Reused for every record so that map() does not allocate two Text objects
  // per input line.
  private final Text outputKey = new Text();
  private final Text outputValue = new Text();

  /**
   * Splits the line and writes {@code (fields[groupByField], fields[selectedField])}.
   *
   * @param key input key supplied by the framework; not used
   * @param value one line of input text
   * @param context task context used to emit the pair and to update counters
   * @throws IOException if writing the output pair fails
   * @throws InterruptedException if the task is interrupted while writing
   */
  @Override
  protected void map(LongWritable key, Text value, Context context) throws IOException,
      InterruptedException {
    String[] fields = splitter.split(value.toString());
    // Pattern.split drops trailing empty strings, so a line may produce fewer
    // fields than expected; skip and count such lines rather than failing.
    if (fields.length <= selectedField || fields.length <= groupByField) {
      context.getCounter("Map", "LinesWithErrors").increment(1);
      return;
    }
    outputKey.set(fields[groupByField]);
    outputValue.set(fields[selectedField]);
    context.write(outputKey, outputValue);
  }

  /**
   * Reads and validates the mapper parameters from the job configuration.
   *
   * @param context task context giving access to the job configuration
   * @throws IllegalArgumentException if a required parameter is missing, a
   *         field index is not an integer or is negative, or the split
   *         pattern is not a valid regular expression
   * @throws IOException declared by the overridden method
   * @throws InterruptedException declared by the overridden method
   */
  @Override
  protected void setup(Context context) throws IOException,
      InterruptedException {
    super.setup(context);
    Map<String,String> params = Parameters.parseParams(context
        .getConfiguration().get(PARAMETERS_KEY, ""));
    splitter = Pattern.compile(requireParam(params, SPLIT_PATTERN_PARAM));
    selectedField = requireFieldIndex(params, SELECTED_FIELD_PARAM);
    groupByField = requireFieldIndex(params, GROUP_BY_FIELD_PARAM);
    log.info("Using: {} {} {} ", new Object[] {groupByField,
                                               splitter,
                                               selectedField});
  }

  /** Returns the named parameter, failing with a descriptive message when it is absent. */
  private static String requireParam(Map<String,String> params, String name) {
    String raw = params.get(name);
    if (raw == null) {
      throw new IllegalArgumentException("Missing required parameter '" + name
          + "' in '" + PARAMETERS_KEY + "'");
    }
    return raw;
  }

  /** Parses the named parameter as a field index and rejects negative values. */
  private static int requireFieldIndex(Map<String,String> params, String name) {
    String raw = requireParam(params, name);
    int index;
    try {
      index = Integer.parseInt(raw);
    } catch (NumberFormatException e) {
      throw new IllegalArgumentException("Parameter '" + name
          + "' must be an integer but was '" + raw + "'", e);
    }
    if (index < 0) {
      // A negative index would otherwise surface as an
      // ArrayIndexOutOfBoundsException on every record in map().
      throw new IllegalArgumentException("Parameter '" + name
          + "' must not be negative but was " + index);
    }
    return index;
  }
}