package org.wikipedia.miner.extract.steps.labelSenses; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import org.apache.avro.mapred.AvroCollector; import org.apache.avro.mapred.AvroReducer; import org.apache.avro.mapred.Pair; import org.apache.hadoop.mapred.Reporter; import org.wikipedia.miner.extract.model.struct.LabelSense; import org.wikipedia.miner.extract.model.struct.LabelSenseList; public abstract class CombinerOrReducer extends AvroReducer<CharSequence, LabelSenseList, Pair<CharSequence, LabelSenseList>> { public enum Counts {ambiguous, unambiguous} ; public abstract boolean isReducer() ; @Override public void reduce(CharSequence label, Iterable<LabelSenseList> senseLists, AvroCollector<Pair<CharSequence, LabelSenseList>> collector, Reporter reporter) throws IOException { LabelSenseList allSenses = new LabelSenseList() ; allSenses.setSenses(new ArrayList<LabelSense>()) ; for (LabelSenseList senses:senseLists) { for (LabelSense sense:senses.getSenses()) { allSenses.getSenses().add(LabelSense.newBuilder(sense).build()) ; } } if (isReducer()) { if (allSenses.getSenses().size() > 1) reporter.getCounter(Counts.ambiguous).increment(1L); else reporter.getCounter(Counts.unambiguous).increment(1L); Collections.sort(allSenses.getSenses(), new SenseComparator()); } collector.collect(new Pair<CharSequence, LabelSenseList>(label, allSenses)); } public static class Combiner extends CombinerOrReducer { @Override public boolean isReducer() { return false; } } public static class Reducer extends CombinerOrReducer { @Override public boolean isReducer() { return true; } } private static class SenseComparator implements Comparator<LabelSense> { @Override public int compare(LabelSense a, LabelSense b) { int cmp = b.getDocCount().compareTo(a.getDocCount()) ; if (cmp != 0) return cmp ; cmp = b.getOccCount().compareTo(a.getOccCount()) ; if (cmp != 0) return cmp ; return a.getId().compareTo(b.getId()); } } }