package org.wikipedia.miner.extract.steps.labelSenses;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.avro.mapred.AvroCollector;
import org.apache.avro.mapred.AvroMapper;
import org.apache.avro.mapred.Pair;
import org.apache.hadoop.mapred.Reporter;
import org.wikipedia.miner.extract.model.struct.LabelSense;
import org.wikipedia.miner.extract.model.struct.LabelSenseList;
import org.wikipedia.miner.extract.model.struct.LabelSummary;
import org.wikipedia.miner.extract.model.struct.PageDetail;
import org.wikipedia.miner.extract.model.struct.PageSummary;
import org.wikipedia.miner.extract.util.SiteInfo;
/**
 * Turns each article page into (label, sense list) pairs.
 *
 * <p>For every page that is in the main namespace and is not itself a redirect,
 * one pair is emitted for each distinct label text associated with the page:
 * the keys of {@code page.getLabels()}, the page's own title, and the title of
 * every entry in {@code page.getRedirects()}. Each emitted {@link LabelSenseList}
 * contains exactly one {@link LabelSense} whose id is the id of the page.
 */
public class Mapper extends AvroMapper<Pair<Integer, PageDetail>, Pair<CharSequence, LabelSenseList>> {

    /**
     * Emits one single-sense {@link LabelSenseList} per label of the given page.
     *
     * @param pair      input record; only its value (the page) is used
     * @param collector receives one (label, sense list) pair per distinct label
     * @param reporter  unused
     * @throws IOException if the collector fails to accept a pair
     */
    @Override
    public void map(Pair<Integer, PageDetail> pair,
            AvroCollector<Pair<CharSequence, LabelSenseList>> collector,
            Reporter reporter) throws IOException {

        PageDetail page = pair.value();

        // we only care about articles
        if (!page.getNamespace().equals(SiteInfo.MAIN_KEY)) {
            return;
        }

        // we don't care about redirects
        if (page.getRedirectsTo() != null) {
            return;
        }

        // NOTE(review): lookups below only find an existing entry when the title
        // and the label keys compare equal as map keys (presumably both are the
        // same CharSequence implementation, e.g. Avro Utf8) - confirm.
        Map<CharSequence, LabelSense> labelSenses = new HashMap<CharSequence, LabelSense>();

        // senses for labels that are used to refer to this page
        for (Entry<CharSequence, LabelSummary> e : page.getLabels().entrySet()) {
            LabelSummary stats = e.getValue();

            LabelSense sense = newZeroCountSense(page);
            sense.setDocCount(stats.getDocCount());
            // the occurrence count must come from the occurrence statistic,
            // not from the document count
            sense.setOccCount(stats.getOccCount());

            labelSenses.put(e.getKey(), sense);
        }

        // the title is always a label, even if nothing refers to the page by it
        LabelSense titleSense = labelSenses.get(page.getTitle());
        if (titleSense == null) {
            titleSense = newZeroCountSense(page);
        }
        titleSense.setFromTitle(true);
        labelSenses.put(page.getTitle(), titleSense);

        // so is the title of every redirect that points at this page
        for (PageSummary redirect : page.getRedirects()) {
            LabelSense redirectSense = labelSenses.get(redirect.getTitle());
            if (redirectSense == null) {
                redirectSense = newZeroCountSense(page);
            }
            redirectSense.setFromRedirect(true);
            labelSenses.put(redirect.getTitle(), redirectSense);
        }

        // emit each label with a list holding only this page's sense
        for (Entry<CharSequence, LabelSense> e : labelSenses.entrySet()) {
            LabelSenseList senseList = new LabelSenseList();
            senseList.setSenses(new ArrayList<LabelSense>());
            senseList.getSenses().add(e.getValue());

            collector.collect(new Pair<CharSequence, LabelSenseList>(e.getKey(), senseList));
        }
    }

    /**
     * Creates a sense for the given page with zero counts and both origin
     * flags cleared; callers then fill in the counts or flags that apply.
     */
    private static LabelSense newZeroCountSense(PageDetail page) {
        LabelSense sense = new LabelSense();
        sense.setId(page.getId());
        sense.setDocCount(0);
        sense.setOccCount(0);
        sense.setFromTitle(false);
        sense.setFromRedirect(false);
        return sense;
    }
}