package org.wikipedia.miner.extract.steps.pageSummary;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.avro.mapred.AvroCollector;
import org.apache.avro.mapred.AvroReducer;
import org.apache.avro.mapred.Pair;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.Logger;
import org.wikipedia.miner.extract.model.struct.LabelSummary;
import org.wikipedia.miner.extract.model.struct.LinkSummary;
import org.wikipedia.miner.extract.model.struct.PageDetail;
import org.wikipedia.miner.extract.model.struct.PageKey;
import org.wikipedia.miner.extract.model.struct.PageSummary;
import org.wikipedia.miner.extract.steps.pageSummary.PageSummaryStep.Unforwarded;
/**
 * Merges every partial {@link PageDetail} that was emitted for one {@link PageKey} into a
 * single combined {@link PageDetail}.
 *
 * <p>The same merge logic is used for both the combine and the reduce phase; the two concrete
 * nested subclasses only differ in what {@link #isReducer()} returns. When running as a reducer
 * the class additionally drops keys for which no page id was ever seen, and reports how many
 * entries are still not flagged as forwarded through the {@link Unforwarded} counters.
 */
public abstract class CombinerOrReducer extends AvroReducer<PageKey, PageDetail, Pair<PageKey, PageDetail>> {

    private static final Logger logger = Logger.getLogger(CombinerOrReducer.class);

    /** Page titles for which verbose merge logging is switched on. */
    private static final String[] DEBUG_TITLES = {"Atheist","Atheism","Atheists","Athiest","People by religion"};

    /**
     * @return {@code true} when this instance runs in the reduce phase, {@code false} when it
     *         runs as a combiner
     */
    public abstract boolean isReducer();

    /**
     * Folds all partial details for {@code key} into one record and emits it.
     *
     * @param key          key shared by all the partials
     * @param pagePartials partial details to merge
     * @param collector    receives the single combined (key, detail) pair
     * @param reporter     receives the {@link Unforwarded} counters (reducer only)
     * @throws IOException if the collector fails to accept the combined record
     */
    @Override
    public void reduce(PageKey key, Iterable<PageDetail> pagePartials,
            AvroCollector<Pair<PageKey, PageDetail>> collector,
            Reporter reporter) throws IOException {

        Integer id = null;
        Long lastEdited = null;
        boolean isDisambiguation = false;
        PageSummary redirectsTo = null;

        List<Integer> sentenceSplits = new ArrayList<Integer>();
        SortedMap<Integer, PageSummary> redirects = new TreeMap<Integer, PageSummary>();
        SortedMap<Integer, LinkSummary> linksIn = new TreeMap<Integer, LinkSummary>();
        SortedMap<Integer, LinkSummary> linksOut = new TreeMap<Integer, LinkSummary>();
        SortedMap<Integer, PageSummary> parentCategories = new TreeMap<Integer, PageSummary>();
        SortedMap<Integer, PageSummary> childCategories = new TreeMap<Integer, PageSummary>();
        SortedMap<Integer, PageSummary> childArticles = new TreeMap<Integer, PageSummary>();
        SortedMap<CharSequence, LabelSummary> labels = new TreeMap<CharSequence, LabelSummary>();

        boolean debug = isDebugTitle(key.getTitle());

        if (debug) {
            logger.info("Processing " + key.toString());
        }

        for (PageDetail pagePartial : pagePartials) {

            if (debug) {
                logger.info("partial: " + pagePartial.toString());
            }

            // scalar fields: the last partial that carries a value wins
            if (pagePartial.getId() != null) {
                id = pagePartial.getId();
            }
            if (pagePartial.getLastEdited() != null) {
                lastEdited = pagePartial.getLastEdited();
            }
            if (pagePartial.getIsDisambiguation()) {
                isDisambiguation = true;
            }

            PageSummary partialTarget = pagePartial.getRedirectsTo();
            if (partialTarget != null) {
                if (debug) {
                    logger.info(" -" + partialTarget + " vs " + redirectsTo);
                }
                redirectsTo = mergeRedirectTarget(redirectsTo, partialTarget);
                if (debug) {
                    logger.info(" - " + redirectsTo);
                }
            }

            // we can't do a straight copy of the collections, because avro seems to reuse
            // these instances between iterations.
            sentenceSplits.addAll(pagePartial.getSentenceSplits());

            redirects = addToPageMap(pagePartial.getRedirects(), redirects);
            linksIn = addToLinkMap(pagePartial.getLinksIn(), linksIn);
            linksOut = addToLinkMap(pagePartial.getLinksOut(), linksOut);
            parentCategories = addToPageMap(pagePartial.getParentCategories(), parentCategories);
            childCategories = addToPageMap(pagePartial.getChildCategories(), childCategories);
            childArticles = addToPageMap(pagePartial.getChildArticles(), childArticles);

            mergeLabels(pagePartial.getLabels(), labels);
        }

        if (id == null && isReducer()) {
            // if we don't know the id of the page by this point, then it must be the
            // result of an unresolvable redirect or link (so forget it)
            return;
        }

        if (debug) {
            for (Map.Entry<Integer, PageSummary> redirect : redirects.entrySet()) {
                logger.info(" - " + redirect.getKey() + ":" + redirect.getValue());
            }
        }

        PageDetail combinedPage = PageSummaryStep.buildEmptyPageDetail();

        combinedPage.setId(id);
        combinedPage.setIsDisambiguation(isDisambiguation);
        combinedPage.setLastEdited(lastEdited);
        combinedPage.setSentenceSplits(sentenceSplits);
        combinedPage.setRedirectsTo(redirectsTo);

        boolean isRedirect = redirectsTo != null;

        // redirects always need forwarding
        combinedPage.setRedirects(convertPagesToList(redirects, true));

        // links in always need to be backtracked (or forwarded by redirects)
        combinedPage.setLinksIn(convertLinksToList(linksIn, true));

        // links out only need to be forwarded by redirects
        combinedPage.setLinksOut(convertLinksToList(linksOut, isRedirect));

        // parent categories only need to be forwarded by redirects
        combinedPage.setParentCategories(convertPagesToList(parentCategories, isRedirect));

        // children of both types always need to be backtracked to parent (or forwarded by redirect)
        combinedPage.setChildCategories(convertPagesToList(childCategories, true));
        combinedPage.setChildArticles(convertPagesToList(childArticles, true));

        combinedPage.setLabels(labels);

        // count stuff that still needs to be forwarded, so we know whether another iteration is needed
        if (isReducer()) {
            reportUnforwarded(combinedPage, redirectsTo, reporter);
        }

        if (debug) {
            logger.info("combined: " + combinedPage.toString());
        }

        collector.collect(new Pair<PageKey, PageDetail>(key, combinedPage));
    }

    /**
     * Tells whether verbose logging is enabled for the given title.
     *
     * <p>The comparison is done on character content. {@link CharSequence} does not define
     * {@code equals} across implementations, so calling {@code title.equals("...")} on a
     * non-{@link String} title would never match the string constants.
     *
     * @param title page title, may be {@code null}
     * @return {@code true} if the title's characters match one of {@link #DEBUG_TITLES}
     */
    private static boolean isDebugTitle(CharSequence title) {
        if (title == null) {
            return false;
        }
        for (String debugTitle : DEBUG_TITLES) {
            if (debugTitle.contentEquals(title)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Combines the redirect target gathered so far with the one carried by another partial.
     *
     * @param current  target gathered so far, or {@code null} if none yet
     * @param incoming non-null target from the partial being merged
     * @return a copy of {@code incoming} when {@code current} is missing or has a negative id;
     *         otherwise {@code current}, flagged as forwarded if {@code incoming} is
     */
    private static PageSummary mergeRedirectTarget(PageSummary current, PageSummary incoming) {
        if (current == null || current.getId() < 0) {
            // always clobber a redirectTo that hasn't been resolved to an id yet
            return PageSummary.newBuilder(incoming).build();
        }
        if (incoming.getForwarded()) {
            current.setForwarded(true);
        }
        return current;
    }

    /**
     * Adds the document and occurrence counts of every label in {@code source} onto the
     * matching entry of {@code target}, creating a zeroed entry for labels not seen before.
     */
    private static void mergeLabels(Map<CharSequence, LabelSummary> source, SortedMap<CharSequence, LabelSummary> target) {
        for (Map.Entry<CharSequence, LabelSummary> e : source.entrySet()) {
            CharSequence label = e.getKey();
            LabelSummary labelStats = target.get(label);
            if (labelStats == null) {
                labelStats = new LabelSummary(0,0);
            }
            labelStats.setDocCount(labelStats.getDocCount() + e.getValue().getDocCount());
            labelStats.setOccCount(labelStats.getOccCount() + e.getValue().getOccCount());
            target.put(label, labelStats);
        }
    }

    /**
     * Increments the {@link Unforwarded} counters for every entry of the combined page that is
     * not flagged as forwarded, including a resolved (id &gt;= 0) but unforwarded redirect target.
     */
    private void reportUnforwarded(PageDetail combinedPage, PageSummary redirectsTo, Reporter reporter) {
        countUnforwardedPages(Unforwarded.redirect, combinedPage.getRedirects(), reporter);
        if (redirectsTo != null && redirectsTo.getId() >= 0 && !redirectsTo.getForwarded()) {
            reporter.incrCounter(Unforwarded.redirect, 1);
        }
        countUnforwardedLinks(Unforwarded.linkIn, combinedPage.getLinksIn(), reporter);
        countUnforwardedLinks(Unforwarded.linkOut, combinedPage.getLinksOut(), reporter);
        countUnforwardedPages(Unforwarded.parentCategory, combinedPage.getParentCategories(), reporter);
        countUnforwardedPages(Unforwarded.childCategory, combinedPage.getChildCategories(), reporter);
        countUnforwardedPages(Unforwarded.childArticle, combinedPage.getChildArticles(), reporter);
    }

    /**
     * Merges {@code pages} into {@code pageMap}, keyed by page id.
     *
     * <p>The first summary seen for an id is kept (as a clone); a later summary with the same
     * id only contributes its forwarded flag.
     *
     * @param pages   summaries to merge, may be {@code null} or empty
     * @param pageMap accumulator, modified in place
     * @return {@code pageMap}
     */
    private SortedMap<Integer,PageSummary> addToPageMap(List<PageSummary> pages, SortedMap<Integer,PageSummary> pageMap) {
        if (pages == null || pages.isEmpty()) {
            return pageMap;
        }
        for (PageSummary page : pages) {
            PageSummary existingPage = pageMap.get(page.getId());
            if (existingPage == null) {
                // the clone is needed because avro seems to reuse these instances.
                // if we don't clone it, it will get overwritten later
                pageMap.put(page.getId(), PageSummaryStep.clone(page));
            } else if (page.getForwarded()) {
                existingPage.setForwarded(true);
            }
        }
        return pageMap;
    }

    /**
     * Merges {@code links} into {@code linkMap}, keyed by link id.
     *
     * <p>The first summary seen for an id is kept (as a clone). A later summary with the same id
     * has its sentence indexes inserted into the kept one (skipping indexes already present) and
     * contributes its forwarded flag.
     *
     * @param links   summaries to merge, may be {@code null} or empty
     * @param linkMap accumulator, modified in place
     * @return {@code linkMap}
     */
    private SortedMap<Integer,LinkSummary> addToLinkMap(List<LinkSummary> links, SortedMap<Integer,LinkSummary> linkMap) {
        if (links == null || links.isEmpty()) {
            return linkMap;
        }
        for (LinkSummary link : links) {
            LinkSummary existingLink = linkMap.get(link.getId());
            if (existingLink == null) {
                // the clone is needed because avro seems to reuse these instances.
                // if we don't clone it, it will get overwritten later
                linkMap.put(link.getId(), PageSummaryStep.clone(link));
                continue;
            }

            // merge lists of sentence indexes.
            // NOTE(review): binarySearch relies on the existing list already being sorted
            // ascending - presumably guaranteed by whoever produced it; confirm.
            for (Integer sentenceIndex : link.getSentenceIndexes()) {
                int pos = Collections.binarySearch(existingLink.getSentenceIndexes(), sentenceIndex);
                if (pos < 0) {
                    // a negative result encodes the insertion point as (-(insertion point) - 1)
                    existingLink.getSentenceIndexes().add((-pos) - 1, sentenceIndex);
                }
            }

            // carry over the forwarded flag
            if (link.getForwarded()) {
                existingLink.setForwarded(true);
            }
        }
        return linkMap;
    }

    /**
     * Returns the map's values as a list in ascending key order.
     *
     * @param pageMap            summaries keyed by id
     * @param requiresForwarding when {@code false}, every summary is flagged as forwarded
     *                           before being added; when {@code true} flags are left as they are
     */
    private List<PageSummary> convertPagesToList(SortedMap<Integer,PageSummary> pageMap, boolean requiresForwarding) {
        List<PageSummary> pages = new ArrayList<PageSummary>();
        for (PageSummary page : pageMap.values()) {
            if (!requiresForwarding) {
                page.setForwarded(true);
            }
            pages.add(page);
        }
        return pages;
    }

    /**
     * Returns the map's values as a list in ascending key order.
     *
     * @param linkMap            summaries keyed by id
     * @param requiresForwarding when {@code false}, every summary is flagged as forwarded
     *                           before being added; when {@code true} flags are left as they are
     */
    private List<LinkSummary> convertLinksToList(SortedMap<Integer,LinkSummary> linkMap, boolean requiresForwarding) {
        List<LinkSummary> links = new ArrayList<LinkSummary>();
        for (LinkSummary link : linkMap.values()) {
            if (!requiresForwarding) {
                link.setForwarded(true);
            }
            links.add(link);
        }
        return links;
    }

    /** Increments {@code counter} once for every page summary not flagged as forwarded. */
    private void countUnforwardedPages(Unforwarded counter, List<PageSummary> pages, Reporter reporter) {
        for (PageSummary page : pages) {
            if (!page.getForwarded()) {
                reporter.incrCounter(counter, 1);
            }
        }
    }

    /** Increments {@code counter} once for every link summary not flagged as forwarded. */
    private void countUnforwardedLinks(Unforwarded counter, List<LinkSummary> links, Reporter reporter) {
        for (LinkSummary link : links) {
            if (!link.getForwarded()) {
                reporter.incrCounter(counter, 1);
            }
        }
    }

    /** Variant used in the combine phase. */
    public static class Combiner extends CombinerOrReducer {
        @Override
        public boolean isReducer() {
            return false;
        }
    }

    /** Variant used in the reduce phase. */
    public static class Reducer extends CombinerOrReducer {
        @Override
        public boolean isReducer() {
            return true;
        }
    }
}