package uk.bl.wa.hadoop.mapreduce.nlp;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.PropertyConfigurator;
import uk.bl.wa.hadoop.mapreduce.mdx.MDX;
import uk.bl.wa.hadoop.mapreduce.mdx.MDXWritable;
/**
 * Reducer that passes complete records straight through and tries to
 * "reduplicate" revisit records.
 *
 * For each key, every record whose type is not "revisit" is emitted
 * immediately under its hash. Revisit records are buffered until all values
 * for the key have been read; if a "response" record was seen for the same
 * key, its properties are merged into each buffered revisit, the revisit is
 * re-typed as "reduplicated" and emitted as JSON. If no response record was
 * seen, the revisits are emitted unchanged.
 *
 * Note that all revisit records for a key are held in memory until the key
 * has been fully consumed.
 */
@SuppressWarnings({ "deprecation" })
public class Word2VecReducer extends MapReduceBase implements
        Reducer<Text, MDXWritable, Text, Text> {

    private static final Log log = LogFactory.getLog(Word2VecReducer.class);

    static enum MyCounters {
        NUM_RECORDS, NUM_REVISITS, NUM_ERRORS, NUM_DROPPED_RECORDS, NUM_UNRESOLVED_REVISITS, NUM_RESOLVED_REVISITS
    }

    /** Classpath location of the optional log4j override configuration. */
    private static final String LOG4J_OVERRIDE_RESOURCE = "/log4j-override.properties";

    private static final Text revisit = new Text("revisit");
    private static final Text response = new Text("response");

    /**
     * Job configuration captured in {@link #configure(JobConf)}; used when
     * cloning value objects. Created lazily if configure() was never called.
     */
    private JobConf jobConf;

    /**
     * Creates the reducer and applies the log4j override properties found on
     * the classpath, if any. A missing or unreadable override file is logged
     * and otherwise ignored, so the existing logging configuration stays in
     * effect.
     */
    public Word2VecReducer() {
        InputStream in = getClass().getResourceAsStream(
                LOG4J_OVERRIDE_RESOURCE);
        if (in == null) {
            // getResourceAsStream returns null (it does not throw) when the
            // resource is absent; Properties.load(null) would then fail with
            // a NullPointerException and abort construction of the reducer.
            log.warn("No " + LOG4J_OVERRIDE_RESOURCE
                    + " found on the classpath; keeping existing log4j config.");
            return;
        }
        try {
            Properties props = new Properties();
            props.load(in);
            PropertyConfigurator.configure(props);
        } catch (IOException e1) {
            log.error("Failed to load log4j config from properties file.", e1);
        } finally {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing a classpath stream
                // fails; the properties have already been read (or not).
            }
        }
    }

    /**
     * Remembers the job configuration so that value objects can be cloned
     * during {@link #reduce}.
     *
     * @param job the configuration of the running job
     */
    @Override
    public void configure(JobConf job) {
        this.jobConf = job;
        log.info("Initialisation complete.");
    }

    /**
     * Emits all non-revisit records for the key and then the buffered
     * revisit records, merged with the first response record when one exists.
     *
     * @param key      the grouping key (not used for output; each record is
     *                 emitted under its own hash)
     * @param values   the records grouped under this key
     * @param output   collector receiving (hash, record) pairs
     * @param reporter used for counters and periodic status updates
     * @throws IOException if the collector fails to accept a record
     */
    @Override
    public void reduce(Text key, Iterator<MDXWritable> values,
            OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        long noValues = 0;
        MDX exemplar = null;
        List<MDXWritable> toReduplicate = new ArrayList<MDXWritable>();

        while (values.hasNext()) {
            MDXWritable mdx = values.next();
            noValues++;

            if (revisit.equals(mdx.getRecordType())) {
                reporter.incrCounter(MyCounters.NUM_REVISITS, 1);
                // The framework reuses the value instance between calls to
                // next(), so a private copy must be buffered; storing the
                // reference would leave every entry pointing at the last
                // value read.
                toReduplicate.add(WritableUtils.clone(mdx, cloningConf()));
            } else {
                // Keep the first response record as the merge source. It is
                // taken from a private copy in case getMDX() exposes state
                // backed by the reused value instance.
                if (exemplar == null && response.equals(mdx.getRecordType())) {
                    exemplar = WritableUtils.clone(mdx, cloningConf()).getMDX();
                }
                // Complete records are emitted immediately:
                Text outKey = new Text(mdx.getHash());
                output.collect(outKey, mdx.getMDXAsText());
            }

            reporter.incrCounter(MyCounters.NUM_RECORDS, 1);

            // Occasionally update status report:
            if ((noValues % 1000) == 0) {
                reporter.setStatus("Processed "
                        + noValues
                        + ", of which "
                        + reporter.getCounter(MyCounters.NUM_REVISITS)
                                .getValue() + " records need reduplication.");
            }
        }

        // Now fix up revisits:
        for (MDXWritable rmdxw : toReduplicate) {
            Text outKey = rmdxw.getHash();
            if (exemplar != null) {
                // Modify the record type and merge in the response properties:
                MDX rmdx = rmdxw.getMDX();
                rmdx.setRecordType("reduplicated");
                rmdx.getProperties().putAll(exemplar.getProperties());
                reporter.incrCounter(MyCounters.NUM_RESOLVED_REVISITS, 1);
                output.collect(outKey, new Text(rmdx.toJSON()));
            } else {
                // No response record was seen for this key; pass through:
                reporter.incrCounter(MyCounters.NUM_UNRESOLVED_REVISITS, 1);
                output.collect(outKey, rmdxw.getMDXAsText());
            }
        }
    }

    /**
     * Returns the configuration to use when cloning values, creating a
     * default one if {@link #configure(JobConf)} has not been called.
     */
    private JobConf cloningConf() {
        if (jobConf == null) {
            jobConf = new JobConf();
        }
        return jobConf;
    }
}