package org.wikipedia.miner.extract.steps.pageDepth;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.avro.mapred.AvroCollector;
import org.apache.avro.mapred.AvroMapper;
import org.apache.avro.mapred.Pair;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.Logger;
import org.wikipedia.miner.extract.DumpExtractor;
import org.wikipedia.miner.extract.model.struct.PageDepthSummary;
import org.wikipedia.miner.extract.model.struct.PageDetail;
import org.wikipedia.miner.extract.model.struct.PageSummary;
import org.wikipedia.miner.extract.util.Languages;
import org.wikipedia.miner.extract.util.Languages.Language;
import org.wikipedia.miner.extract.util.SiteInfo;
import org.wikipedia.miner.extract.util.Util;
public class InitialDepthMapper extends AvroMapper<Pair<Integer, PageDetail>, Pair<Integer, PageDepthSummary>> {
private static Logger logger = Logger.getLogger(SubsequentDepthMapper.class) ;
private String rootCategoryTitle ;
@Override
public void configure(JobConf job) {
try {
Language language = null ;
Path[] cacheFiles = DistributedCache.getLocalCacheFiles(job);
for (Path cf:cacheFiles) {
if (cf.getName().equals(new Path(job.get(DumpExtractor.KEY_LANG_FILE)).getName())) {
language = Languages.load(new File(cf.toString())).get(job.get(DumpExtractor.KEY_LANG_CODE)) ;
}
}
if (language == null)
throw new Exception("Could not locate '" + job.get(DumpExtractor.KEY_LANG_FILE) + "' in DistributedCache") ;
rootCategoryTitle = Util.normaliseTitle(language.getRootCategory()) ;
} catch (Exception e) {
logger.error("Could not configure mapper", e);
}
logger.info(rootCategoryTitle) ;
}
@Override
public void map(Pair<Integer, PageDetail> pair,
AvroCollector<Pair<Integer, PageDepthSummary>> collector,
Reporter reporter) throws IOException {
if (rootCategoryTitle == null)
throw new IOException("Mapper not configured with root category title") ;
PageDetail page = pair.value() ;
if (!page.getNamespace().equals(SiteInfo.CATEGORY_KEY) && !page.getNamespace().equals(SiteInfo.MAIN_KEY)) {
//this only effects articles and categories, just discard other page types
return ;
}
if (page.getRedirectsTo() != null) {
//this doesn't effect redirects, so just discard them
return ;
}
PageDepthSummary depthSummary = new PageDepthSummary() ;
depthSummary.setChildIds(new ArrayList<Integer>()) ;
for (PageSummary childCat:page.getChildCategories())
depthSummary.getChildIds().add(childCat.getId()) ;
for (PageSummary childArt:page.getChildArticles())
depthSummary.getChildIds().add(childArt.getId()) ;
if (rootCategoryTitle.equals(page.getTitle().toString())) {
depthSummary.setDepth(0) ;
shareDepth(depthSummary, collector, reporter) ;
}
collect(page.getId(), depthSummary, collector);
}
public static void shareDepth(PageDepthSummary page, AvroCollector<Pair<Integer, PageDepthSummary>> collector, Reporter reporter) throws IOException {
if (page.getDepth() == null)
return ;
if (page.getDepthForwarded())
return ;
//logger.info("sharing depths for " + page.getTitle() + ": " + page.getDepth());
for (Integer childId:page.getChildIds()) {
PageDepthSummary child = new PageDepthSummary() ;
child.setDepth(page.getDepth() + 1);
child.setDepthForwarded(false);
child.setChildIds(new ArrayList<Integer>());
collect(childId, child, collector) ;
}
page.setDepthForwarded(true);
}
public static void collect(Integer pageId, PageDepthSummary pageDepth, AvroCollector<Pair<Integer, PageDepthSummary>> collector) throws IOException {
collector.collect(new Pair<Integer,PageDepthSummary>(pageId,pageDepth));
}
}