package org.wikipedia.miner.extract.steps.pageSummary;
import java.io.IOException;
import java.util.HashMap;
import org.apache.avro.mapred.AvroCollector;
import org.apache.avro.mapred.AvroMapper;
import org.apache.avro.mapred.Pair;
import org.apache.hadoop.mapred.Reporter;
import org.wikipedia.miner.extract.model.struct.LabelSummary;
import org.wikipedia.miner.extract.model.struct.LinkSummary;
import org.wikipedia.miner.extract.model.struct.PageDetail;
import org.wikipedia.miner.extract.model.struct.PageKey;
import org.wikipedia.miner.extract.model.struct.PageSummary;
/**
 * Avro mapper that propagates page relations (redirects, in-links, child categories,
 * child articles and labels) between pages, one hop per invocation.
 *
 * <p>For every input pair the mapper does one of two things:
 * <ul>
 *   <li>If the page is a redirect ({@code getRedirectsTo() != null}), every relation that has
 *       not yet been flagged as forwarded is copied onto an otherwise empty
 *       {@link PageDetail} keyed by the redirect target, and that detail is emitted.</li>
 *   <li>Otherwise, for every not-yet-forwarded relation an otherwise empty {@link PageDetail}
 *       is emitted to the page at the other end of the relation, carrying this page's id,
 *       title and namespace.</li>
 * </ul>
 * In both cases the (mutated) input page is re-emitted under its own key as the last output.
 */
public class SubsequentMapper extends AvroMapper<Pair<PageKey, PageDetail>, Pair<PageKey, PageDetail>> {

    /**
     * Processes a single page, emitting forwarded/backtracked relations and finally the page itself.
     *
     * @param pair      the page key and the detail gathered for it so far; the detail is mutated
     *                  (forwarded flags are set, labels may be cleared) before being re-emitted
     * @param collector receives every emitted (key, detail) pair
     * @param reporter  unused by this mapper
     * @throws IOException if the collector fails to accept a pair
     */
    @Override
    public void map(Pair<PageKey, PageDetail> pair,
            AvroCollector<Pair<PageKey, PageDetail>> collector,
            Reporter reporter) throws IOException {

        PageKey pageKey = pair.key() ;
        PageDetail page = pair.value() ;

        if (page.getRedirectsTo() != null) {
            //this is a redirect, so it has to be treated very differently from other page types
            //it is a kind of conduit that relations need to be forwarded through
            forwardThroughRedirect(pageKey, page, collector) ;
        } else {
            backtrackFromPage(pageKey, page, collector) ;
        }

        //emit the page, so we can pick it up again in the reducer
        collector.collect(new Pair<PageKey,PageDetail>(pageKey, page));
    }

    /**
     * Handles a page that is a redirect: backtracks a resolved target to the redirects pointing
     * at this page, then forwards all not-yet-forwarded relations (and all labels) to the target.
     */
    private void forwardThroughRedirect(PageKey pageKey, PageDetail page,
            AvroCollector<Pair<PageKey, PageDetail>> collector) throws IOException {

        CharSequence targetTitle = page.getRedirectsTo().getTitle() ;

        // NOTE(review): the target key reuses the namespace of the redirecting page rather than
        // the namespace carried by getRedirectsTo() - confirm that redirects never cross namespaces.
        PageKey targetKey = new PageKey(pageKey.getNamespace(), targetTitle) ;
        PageDetail target = PageSummaryStep.buildEmptyPageDetail() ;

        //if this target is resolved (we know its id), backtrack it to any redirects that point to this page
        //so that they will also know what their eventual target is
        if (page.getRedirectsTo().getId() > 0 && !page.getRedirectsTo().getForwarded()) {

            for (PageSummary redirect:page.getRedirects()) {
                //backtrack this redirect to the target of this page (so we are following down the redirect chain)
                PageKey redirectKey = new PageKey(redirect.getNamespace(), redirect.getTitle()) ;

                PageDetail redirectDetail = PageSummaryStep.buildEmptyPageDetail() ;
                redirectDetail.setRedirectsTo(PageSummaryStep.clone(page.getRedirectsTo())) ;

                collector.collect(new Pair<PageKey,PageDetail>(redirectKey, redirectDetail));
            }

            //and record that it has been backtracked
            page.getRedirectsTo().setForwarded(true) ;
        }

        //if this redirect receives any redirects, forward them on to the target
        for (PageSummary redirect:page.getRedirects()) {
            if (redirect.getForwarded()) {
                continue ;
            }

            //forward this redirect to the target of this page (so we are following down the redirect chain)
            target.getRedirects().add(PageSummaryStep.clone(redirect)) ;

            //and record that it has been forwarded
            redirect.setForwarded(true);
        }

        for (LinkSummary linkIn:page.getLinksIn()) {
            if (linkIn.getForwarded()) {
                continue ;
            }

            //forward this link to the target of this page (so we are following down the redirect chain)
            target.getLinksIn().add(PageSummaryStep.clone(linkIn)) ;
            linkIn.setForwarded(true);
        }

        for (PageSummary childCategory:page.getChildCategories()) {
            if (childCategory.getForwarded()) {
                continue ;
            }

            target.getChildCategories().add(PageSummaryStep.clone(childCategory)) ;
            childCategory.setForwarded(true);
        }

        //child articles must be read from getChildArticles(); iterating the child categories here
        //would find everything already forwarded by the loop above and forward nothing
        for (PageSummary childArticle:page.getChildArticles()) {
            if (childArticle.getForwarded()) {
                continue ;
            }

            target.getChildArticles().add(PageSummaryStep.clone(childArticle)) ;
            childArticle.setForwarded(true);
        }

        //redirects should not get any links out or parent relations, so do nothing with those

        //immediately pass on any label counts to target
        target.setLabels(page.getLabels());

        //and remove them from here (otherwise they will get counted multiple times)
        page.setLabels(new HashMap<CharSequence,LabelSummary>()) ;

        //emit the details of the target that we have built up
        collector.collect(new Pair<PageKey,PageDetail>(targetKey, target));
    }

    /**
     * Handles a page that is not a redirect: for each not-yet-forwarded incoming relation, emits
     * a detail to the page at the other end that names this page as the resolved target/parent.
     */
    private void backtrackFromPage(PageKey pageKey, PageDetail page,
            AvroCollector<Pair<PageKey, PageDetail>> collector) throws IOException {

        for (PageSummary redirect:page.getRedirects()) {
            if (redirect.getForwarded()) {
                continue ;
            }

            //backtrack, so the redirect knows what the resolved target is
            PageKey redirectKey = new PageKey(redirect.getNamespace(), redirect.getTitle()) ;

            PageDetail redirectDetail = PageSummaryStep.buildEmptyPageDetail() ;
            redirectDetail.setRedirectsTo(new PageSummary(page.getId(), pageKey.getTitle(), pageKey.getNamespace(), false));

            collector.collect(new Pair<PageKey,PageDetail>(redirectKey, redirectDetail));

            //and record that it has been forwarded
            redirect.setForwarded(true);
        }

        for (LinkSummary linkIn:page.getLinksIn()) {
            if (linkIn.getForwarded()) {
                continue ;
            }

            //backtrack, so the source of this link knows what the resolved target is
            PageKey sourceKey = new PageKey(linkIn.getNamespace(), linkIn.getTitle()) ;

            PageDetail sourceDetail = PageSummaryStep.buildEmptyPageDetail() ;
            sourceDetail.getLinksOut().add(new LinkSummary(page.getId(), pageKey.getTitle(), pageKey.getNamespace(), false, linkIn.getSentenceIndexes()));

            collector.collect(new Pair<PageKey,PageDetail>(sourceKey, sourceDetail));

            //and record that it has been forwarded
            linkIn.setForwarded(true);
        }

        for (LinkSummary linkOut:page.getLinksOut()) {
            //immediately set these as forwarded, because we only get them if they have been forwarded and backtracked already
            linkOut.setForwarded(true);
        }

        for (PageSummary childCategory:page.getChildCategories()) {
            if (childCategory.getForwarded()) {
                continue ;
            }

            //backtrack, so the child knows what the resolved parent is
            PageKey childKey = new PageKey(childCategory.getNamespace(), childCategory.getTitle()) ;

            PageDetail childDetail = PageSummaryStep.buildEmptyPageDetail() ;
            childDetail.getParentCategories().add(new PageSummary(page.getId(), pageKey.getTitle(), pageKey.getNamespace(), false));

            collector.collect(new Pair<PageKey,PageDetail>(childKey, childDetail));

            //and record that it has been forwarded
            childCategory.setForwarded(true);
        }

        for (PageSummary childArticle:page.getChildArticles()) {
            if (childArticle.getForwarded()) {
                continue ;
            }

            //backtrack, so the child knows what the resolved parent is
            PageKey childKey = new PageKey(childArticle.getNamespace(), childArticle.getTitle()) ;

            PageDetail childDetail = PageSummaryStep.buildEmptyPageDetail() ;
            childDetail.getParentCategories().add(new PageSummary(page.getId(), pageKey.getTitle(), pageKey.getNamespace(), false));

            collector.collect(new Pair<PageKey,PageDetail>(childKey, childDetail));

            //and record that it has been forwarded
            childArticle.setForwarded(true);
        }

        for (PageSummary parentCategory:page.getParentCategories()) {
            //immediately set these as forwarded, because we only get them if they have been forwarded and backtracked already
            parentCategory.setForwarded(true);
        }
    }
}