package org.wikipedia.miner.extract.steps.pageSummary;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;

import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.Logger;
import org.wikipedia.miner.extract.DumpExtractor;
import org.wikipedia.miner.extract.model.DumpLink;
import org.wikipedia.miner.extract.model.DumpLinkParser;
import org.wikipedia.miner.extract.model.DumpPage;
import org.wikipedia.miner.extract.model.DumpPageParser;
import org.wikipedia.miner.extract.model.struct.LabelSummary;
import org.wikipedia.miner.extract.model.struct.LinkSummary;
import org.wikipedia.miner.extract.model.struct.PageDetail;
import org.wikipedia.miner.extract.model.struct.PageKey;
import org.wikipedia.miner.extract.model.struct.PageSummary;
import org.wikipedia.miner.extract.steps.pageSummary.PageSummaryStep.SummaryPageType;
import org.wikipedia.miner.extract.util.Languages;
import org.wikipedia.miner.extract.util.Languages.Language;
import org.wikipedia.miner.extract.util.PageSentenceExtractor;
import org.wikipedia.miner.extract.util.SiteInfo;
import org.wikipedia.miner.model.Page.PageType;
import org.wikipedia.miner.util.MarkupStripper;

/**
 * The initial map step of the page summary job. Parses raw dump pages and
 * emits partial {@link PageDetail} records keyed by {@link PageKey}, so that
 * links, labels, redirects and category memberships can be merged downstream.
 */
public class InitialMapper extends MapReduceBase implements Mapper<LongWritable, Text, AvroKey<PageKey>, AvroValue<PageDetail>> {

    private static Logger logger = Logger.getLogger(InitialMapper.class);

    private Language language;
    private SiteInfo siteInfo;

    private DumpPageParser pageParser;
    private DumpLinkParser linkParser;

    private MarkupStripper stripper = new MarkupStripper();
    private PageSentenceExtractor sentenceExtractor;

    private String[] debugTitles = {"Atheist", "Atheism", "Atheists", "Athiest", "People by religion"};

    @Override
    public void configure(JobConf job) {

        try {
            language = null;
            siteInfo = null;

            // the site info, language file and sentence-splitting model are
            // all shipped to each node via the DistributedCache
            Path[] cacheFiles = DistributedCache.getLocalCacheFiles(job);

            for (Path cf : cacheFiles) {

                if (cf.getName().equals(new Path(DumpExtractor.OUTPUT_SITEINFO).getName())) {
                    siteInfo = SiteInfo.load(new File(cf.toString()));
                }

                if (cf.getName().equals(new Path(job.get(DumpExtractor.KEY_LANG_FILE)).getName())) {
                    language = Languages.load(new File(cf.toString())).get(job.get(DumpExtractor.KEY_LANG_CODE));
                }

                if (cf.getName().equals(new Path(job.get(DumpExtractor.KEY_SENTENCE_MODEL)).getName())) {
                    sentenceExtractor = new PageSentenceExtractor(cf);
                }
            }

            if (siteInfo == null)
                throw new Exception("Could not locate '" + DumpExtractor.OUTPUT_SITEINFO + "' in DistributedCache");

            if (language == null)
                throw new Exception("Could not locate '" + job.get(DumpExtractor.KEY_LANG_FILE) + "' in DistributedCache");

            pageParser = new DumpPageParser(language, siteInfo);
            linkParser = new DumpLinkParser(language, siteInfo);

            //rootCategoryTitle = Util.normaliseTitle(languageConfig.getRootCategoryName());

        } catch (Exception e) {
            logger.error("Could not configure mapper", e);
        }
    }
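
    /*
     * For reference, configure() expects the three cached files above to have
     * been staged by the job driver. A minimal sketch of that wiring, with
     * hypothetical HDFS paths and file names (the real setup lives in
     * DumpExtractor):
     *
     *   JobConf job = new JobConf(DumpExtractor.class);
     *   job.set(DumpExtractor.KEY_LANG_CODE, "en");
     *   job.set(DumpExtractor.KEY_LANG_FILE, "languages.xml");
     *   job.set(DumpExtractor.KEY_SENTENCE_MODEL, "en-sent.bin");
     *   DistributedCache.addCacheFile(new Path("/working/siteinfo.xml").toUri(), job);
     *   DistributedCache.addCacheFile(new Path("/working/languages.xml").toUri(), job);
     *   DistributedCache.addCacheFile(new Path("/working/en-sent.bin").toUri(), job);
     */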
    public void map(LongWritable key, Text value, OutputCollector<AvroKey<PageKey>, AvroValue<PageDetail>> collector, Reporter reporter) throws IOException {

        DumpPage parsedPage = null;

        try {
            parsedPage = pageParser.parsePage(value.toString());
        } catch (Exception e) {
            reporter.incrCounter(SummaryPageType.unparseable, 1);
            logger.error("Could not parse dump page", e);
        }

        if (parsedPage == null)
            return;

        switch (parsedPage.getType()) {

        case article:
            reporter.incrCounter(SummaryPageType.article, 1);
            handleArticleOrCategory(parsedPage, collector, reporter);
            break;
        case category:
            reporter.incrCounter(SummaryPageType.category, 1);
            handleArticleOrCategory(parsedPage, collector, reporter);
            break;
        case disambiguation:
            reporter.incrCounter(SummaryPageType.disambiguation, 1);
            //apart from the counting, don't treat disambiguation pages any differently from ordinary articles
            handleArticleOrCategory(parsedPage, collector, reporter);
            break;
        case redirect:
            if (parsedPage.getNamespace().getKey() == SiteInfo.MAIN_KEY)
                reporter.incrCounter(SummaryPageType.articleRedirect, 1);
            if (parsedPage.getNamespace().getKey() == SiteInfo.CATEGORY_KEY)
                reporter.incrCounter(SummaryPageType.categoryRedirect, 1);
            handleRedirect(parsedPage, collector, reporter);
            break;
        default:
            //for all other page types (e.g. templates) do nothing
            return;
        }
    }

    private PageKey buildKey(DumpPage parsedPage) {
        PageKey key = new PageKey();
        key.setNamespace(parsedPage.getNamespace().getKey());
        key.setTitle(parsedPage.getTitle());
        return key;
    }

    private PageDetail buildBasePageDetails(DumpPage parsedPage) {

        PageDetail page = PageSummaryStep.buildEmptyPageDetail();
        page.setId(parsedPage.getId());

        if (parsedPage.getType().equals(PageType.disambiguation))
            page.setIsDisambiguation(true);

        //note: we don't set namespace or title, because these will be found in page keys (so it would be wasteful to repeat them)

        if (parsedPage.getTarget() != null)
            page.setRedirectsTo(new PageSummary(-1, parsedPage.getTarget(), parsedPage.getNamespace().getKey(), false));

        if (parsedPage.getLastEdited() != null)
            page.setLastEdited(parsedPage.getLastEdited().getTime());

        return page;
    }

    private PageSummary buildPageSummary(DumpPage parsedPage) {
        PageSummary summary = new PageSummary();
        summary.setId(parsedPage.getId());
        summary.setNamespace(parsedPage.getNamespace().getKey());
        summary.setTitle(parsedPage.getTitle());
        return summary;
    }

    /*
    private void handleCategory(DumpPage parsedPage, OutputCollector<AvroKey<PageKey>, AvroValue<PageDetail>> collector, Reporter reporter) throws IOException {

        PageDetail page = buildBasePageDetails(parsedPage);
        collect(new PageKey(page.getNamespace(), page.getTitle()), page, collector);

        if (page.getTitle().equals(rootCategoryTitle)) {
            logger.info("Root category id: " + parsedPage.getId());
        }

        handleLinks(page, parsedPage.getMarkup(), collector, reporter);
    }
    */

    private void handleArticleOrCategory(DumpPage parsedPage, OutputCollector<AvroKey<PageKey>, AvroValue<PageDetail>> collector, Reporter reporter) throws IOException {

        boolean debug = false;
        for (String debugTitle : debugTitles) {
            if (parsedPage.getTitle().equalsIgnoreCase(debugTitle))
                debug = true;
        }

        PageKey key = buildKey(parsedPage);
        PageDetail page = buildBasePageDetails(parsedPage);

        try {
            List<Integer> sentenceSplits = sentenceExtractor.getSentenceSplits(parsedPage);
            page.setSentenceSplits(sentenceSplits);
        } catch (Exception e) {
            logger.warn("Could not gather sentence splits for " + parsedPage.getTitle(), e);
            logger.info(parsedPage.getMarkup());
        }

        collect(key, page, collector);
        handleLinks(key, page, parsedPage.getMarkup(), collector, reporter);

        if (debug)
            logger.info(page);
    }
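
    /*
     * A redirect produces two partial records, one under each page key
     * involved. Roughly, for a (hypothetical) redirect page "Foo" whose
     * target is "Bar":
     *
     *   (MAIN, "Foo") -> PageDetail{ id=..., redirectsTo=PageSummary(-1, "Bar", MAIN, false) }
     *   (MAIN, "Bar") -> PageDetail{ redirects=[PageSummary(id of "Foo", "Foo", MAIN, false)] }
     *
     * The second record lets the eventual owner of "Bar" learn that "Foo"
     * redirects to it; the -1 id in the first is a placeholder, since the
     * target's real id isn't known until the records are merged.
     */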
    private void handleRedirect(DumpPage parsedPage, OutputCollector<AvroKey<PageKey>, AvroValue<PageDetail>> collector, Reporter reporter) throws IOException {

        PageDetail page = buildBasePageDetails(parsedPage);
        collect(buildKey(parsedPage), page, collector);

        String targetTitle = parsedPage.getTarget();

        //emit a pair to associate this redirect with its target
        PageSummary source = buildPageSummary(parsedPage);
        source.setForwarded(false);

        PageDetail target = PageSummaryStep.buildEmptyPageDetail();
        target.getRedirects().add(source);

        collect(new PageKey(parsedPage.getNamespace().getKey(), targetTitle), target, collector);
    }

    public void handleLinks(PageKey key, PageDetail page, String markup, OutputCollector<AvroKey<PageKey>, AvroValue<PageDetail>> collector, Reporter reporter) throws IOException {

        String strippedMarkup = null;

        try {
            strippedMarkup = stripper.stripAllButInternalLinksAndEmphasis(markup, ' ');
        } catch (Exception e) {
            logger.warn("Could not process link markup for " + page.getId() + ":" + key.getTitle());
            return;
        }

        Vector<int[]> linkRegions = stripper.gatherComplexRegions(strippedMarkup, "\\[\\[", "\\]\\]");

        //aggregate by target title, so we emit only one record per distinct target page
        Map<String, PageDetail> linkTargets = new HashMap<String, PageDetail>();
        Map<String, PageDetail> categoryParents = new HashMap<String, PageDetail>();

        for (int[] linkRegion : linkRegions) {

            String linkMarkup = strippedMarkup.substring(linkRegion[0] + 2, linkRegion[1] - 2);

            DumpLink link = null;
            try {
                link = linkParser.parseLink(linkMarkup, key.getTitle().toString());
            } catch (Exception e) {
                logger.warn("Could not parse link markup '" + linkMarkup + "'");
            }

            if (link == null)
                continue;

            if (link.getTargetLanguage() != null) {
                //logger.info("Language link: " + linkMarkup);
                //TODO: how do we get translations now?
                continue;
            }

            if (link.getTargetNamespace().getKey() == SiteInfo.CATEGORY_KEY) {
                String parentTitle = link.getTargetTitle();

                PageDetail parent = buildCategoryParent(key, page, link);
                if (parent != null)
                    categoryParents.put(parentTitle, parent);
            }

            if (link.getTargetNamespace().getKey() == SiteInfo.MAIN_KEY) {
                String targetTitle = link.getTargetTitle();

                PageDetail target = linkTargets.get(targetTitle);
                if (target == null)
                    target = PageSummaryStep.buildEmptyPageDetail();

                target = buildLinkTarget(key, page, link, linkRegion[0], target);
                linkTargets.put(targetTitle, target);

                if (link.getAnchor().contains("|"))
                    logger.warn("Weird link in " + key.getTitle() + ": \"" + linkMarkup + "\"");
            }
        }

        //emit collected link targets
        for (Map.Entry<String, PageDetail> e : linkTargets.entrySet()) {
            PageKey targetKey = new PageKey(SiteInfo.MAIN_KEY, e.getKey());
            collect(targetKey, e.getValue(), collector);
        }

        //emit collected category parents
        for (Map.Entry<String, PageDetail> e : categoryParents.entrySet()) {
            PageKey parentKey = new PageKey(SiteInfo.CATEGORY_KEY, e.getKey());
            collect(parentKey, e.getValue(), collector);
        }
    }
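
    /*
     * Note on the sentence-index lookup in buildLinkTarget() below:
     * Collections.binarySearch returns the match index when linkStart falls
     * exactly on a split offset, and -(insertionPoint) - 1 when it does not.
     * For example, binarySearch([50, 120, 200], 130) returns -3, which the
     * expression (1 - result) - 1 maps back to a non-negative sentence slot.
     */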
    private PageDetail buildLinkTarget(PageKey currKey, PageDetail currPage, DumpLink link, int linkStart, PageDetail targetPage) {

        /*
         * Emit details of this link, so it can be picked up by the target.
         * The details we want to emit are:
         *  - that the current page is the source of the link
         *  - the sentence index within the current page where this link is found
         *  - the anchor text used for the link (i.e. the label)
         */

        //basics about the link source
        LinkSummary source;
        if (targetPage.getLinksIn().isEmpty()) {
            source = new LinkSummary();
            source.setId(currPage.getId());
            source.setTitle(currKey.getTitle());
            source.setNamespace(currKey.getNamespace());
            source.setForwarded(false);
            source.setSentenceIndexes(new ArrayList<Integer>());
            targetPage.getLinksIn().add(source);
        } else {
            source = targetPage.getLinksIn().get(0);
        }

        //sentence index of the link
        int sentenceIndex = Collections.binarySearch(currPage.getSentenceSplits(), linkStart);
        if (sentenceIndex < 0)
            sentenceIndex = ((1 - sentenceIndex) - 1);

        source.getSentenceIndexes().add(sentenceIndex);

        //the anchor text of the link
        LabelSummary label = targetPage.getLabels().get(link.getAnchor());
        if (label == null) {
            label = new LabelSummary();
            label.setDocCount(1);
            label.setOccCount(1);
            targetPage.getLabels().put(link.getAnchor(), label);
        } else {
            //this page has already used this anchor; bump the occurrence count only
            label.setOccCount(label.getOccCount() + 1);
        }

        //associate everything with the target of the link
        //PageDetail target = PageSummaryStep.buildEmptyPageDetail();
        //target.getLinksIn().add(source);
        //target.getLabels().add(label);

        return targetPage;
    }

    /**
     * Builds a pair that associates the current page as a child of the target category.
     * The link will need to be backtracked before we can register the target as a parent of the source.
     * It may also need to be forwarded via any redirects.
     *
     * @param currKey the key of the current page
     * @param currPage the current page
     * @param link the parsed category link
     * @return a partial PageDetail for the parent category, or null if the current page is neither an article nor a category
     */
    private PageDetail buildCategoryParent(PageKey currKey, PageDetail currPage, DumpLink link) {

        //emit details of this link, so it can be picked up by the target
        PageSummary child = new PageSummary();
        child.setId(currPage.getId());
        child.setTitle(currKey.getTitle());
        child.setNamespace(currKey.getNamespace());
        child.setForwarded(false);

        PageDetail parent = PageSummaryStep.buildEmptyPageDetail();

        if (currKey.getNamespace() == SiteInfo.CATEGORY_KEY)
            parent.getChildCategories().add(child);
        else if (currKey.getNamespace() == SiteInfo.MAIN_KEY)
            parent.getChildArticles().add(child);
        else
            return null;

        return parent;
    }

    private void collect(PageKey key, PageDetail value, OutputCollector<AvroKey<PageKey>, AvroValue<PageDetail>> collector) throws IOException {
        AvroKey<PageKey> k = new AvroKey<PageKey>(key);
        AvroValue<PageDetail> v = new AvroValue<PageDetail>(value);
        collector.collect(k, v);
    }
}
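
/*
 * Shape of the map output, for orientation (titles hypothetical): a source
 * page "Foo" whose markup contains the link [[Bar]] contributes
 *
 *   (MAIN, "Foo") -> PageDetail{ id, sentenceSplits, ... }
 *   (MAIN, "Bar") -> PageDetail{ linksIn=[LinkSummary for "Foo"], labels={"Bar": occCount=1, docCount=1} }
 *
 * Each distinct PageKey therefore accumulates partial PageDetail records from
 * many source pages; merging them into one complete record is presumably the
 * job of the reduce side of PageSummaryStep.
 */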