package org.wikipedia.miner.extract.steps.labelOccurrences;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.Span;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.Logger;
import org.wikipedia.miner.extract.DumpExtractor;
import org.wikipedia.miner.extract.model.DumpLink;
import org.wikipedia.miner.extract.model.DumpLinkParser;
import org.wikipedia.miner.extract.model.DumpPage;
import org.wikipedia.miner.extract.model.DumpPageParser;
import org.wikipedia.miner.extract.model.struct.LabelOccurrences;
import org.wikipedia.miner.extract.util.Languages;
import org.wikipedia.miner.extract.util.Languages.Language;
import org.wikipedia.miner.extract.util.PageSentenceExtractor;
import org.wikipedia.miner.extract.util.SiteInfo;
import org.wikipedia.miner.util.MarkupStripper;
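/**
 * Hadoop mapper for the label occurrence counting step. For each article in
 * the dump, it counts how often every known label appears as the anchor text
 * of an internal link and how often it appears as plain text, emitting one
 * {@link LabelOccurrences} record per label seen on the page.
 */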
public class Mapper implements org.apache.hadoop.mapred.Mapper<LongWritable, Text, AvroKey<CharSequence>, AvroValue<LabelOccurrences>> {
private static final Logger logger = Logger.getLogger(Mapper.class) ;
private Language language ;
private SiteInfo siteInfo ;
private DumpPageParser pageParser ;
private DumpLinkParser linkParser ;
private MarkupStripper stripper = new MarkupStripper() ;
private PageSentenceExtractor sentenceExtractor ;
private int totalLabels ;
private List<Path> labelPaths ;
LabelCache labelCache ;
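/**
 * Locates the resources this mapper depends on in the DistributedCache: the
 * siteinfo file, the language configuration, and the sentence detection
 * model. Any other cached file is assumed to contain labels and senses.
 */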
@Override
public void configure(JobConf job) {
try {
language = null ;
siteInfo = null ;
labelPaths = new ArrayList<Path>() ;
Path[] cacheFiles = DistributedCache.getLocalCacheFiles(job);
if (cacheFiles == null)
throw new Exception("Could not locate any files in DistributedCache") ;
for (Path cf:cacheFiles) {
if (cf.getName().equals(new Path(DumpExtractor.OUTPUT_SITEINFO).getName())) {
siteInfo = SiteInfo.load(new File(cf.toString())) ;
} else if (cf.getName().equals(new Path(job.get(DumpExtractor.KEY_LANG_FILE)).getName())) {
language = Languages.load(new File(cf.toString())).get(job.get(DumpExtractor.KEY_LANG_CODE)) ;
} else if (cf.getName().equals(new Path(job.get(DumpExtractor.KEY_SENTENCE_MODEL)).getName())) {
sentenceExtractor = new PageSentenceExtractor(cf) ;
} else {
//assume this contains the labels and senses
labelPaths.add(cf) ;
}
}
if (siteInfo == null)
throw new Exception("Could not locate '" + DumpExtractor.OUTPUT_SITEINFO + "' in DistributedCache") ;
if (language == null)
throw new Exception("Could not locate '" + job.get(DumpExtractor.KEY_LANG_FILE) + "' in DistributedCache") ;
if (labelPaths.isEmpty())
throw new Exception("Could not locate any label files in DistributedCache") ;
pageParser = new DumpPageParser(language, siteInfo) ;
linkParser = new DumpLinkParser(language, siteInfo) ;
totalLabels = job.getInt(LabelOccurrenceStep.KEY_TOTAL_LABELS, 0) ;
if (totalLabels == 0)
throw new Exception("Could not retrieve total number of labels") ;
} catch (Exception e) {
logger.error("Could not configure mapper", e);
}
labelCache = LabelCache.get();
}
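/**
 * Processes one page from the dump. Redirects and pages outside the main
 * namespace are skipped; for articles, link and plain-text occurrences of
 * known labels are counted and emitted, keyed by label text.
 */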
@Override
public void map(LongWritable key, Text value, OutputCollector<AvroKey<CharSequence>, AvroValue<LabelOccurrences>> collector, Reporter reporter) throws IOException {
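//load the label cache on the first map call only; LabelCache.get() presumably returns an instance shared across calls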
if (!labelCache.isLoaded())
labelCache.load(labelPaths, totalLabels, reporter);
DumpPage parsedPage = null ;
try {
parsedPage = pageParser.parsePage(value.toString()) ;
} catch (Exception e) {
//reporter.incrCounter(PageType.unparseable, 1);
logger.error("Could not parse dump page " , e) ;
}
if (parsedPage == null)
return ;
//only care about articles
if (parsedPage.getNamespace().getKey() != SiteInfo.MAIN_KEY)
return ;
//don't care about redirects
if (parsedPage.getTarget() != null)
return ;
Map<CharSequence,LabelOccurrences> labels = new HashMap<CharSequence,LabelOccurrences>() ;
String markup = parsedPage.getMarkup() ;
try {
markup = stripper.stripAllButInternalLinksAndEmphasis(markup, null) ;
//markup = stripper.stripEmphasis(markup, null) ;
} catch (Exception e) {
logger.error("Could not strip markup: " + markup);
return ;
}
labels = handleLinks(parsedPage, markup, labels, reporter) ;
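//with the links counted, strip them away so the remaining plain text can be scanned sentence by sentence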
markup = stripper.stripInternalLinks(markup, null) ;
int lastSplit = 0 ;
for (int split:sentenceExtractor.getSentenceSplits(markup)) {
labels = handleSentence(markup.substring(lastSplit, split), labels, reporter) ;
lastSplit = split ;
}
labels = handleSentence(markup.substring(lastSplit), labels, reporter) ;
for (Map.Entry<CharSequence, LabelOccurrences> e:labels.entrySet())
collector.collect(new AvroKey<CharSequence>(e.getKey()), new AvroValue<LabelOccurrences>(e.getValue()));
logger.info(parsedPage.getTitle() + ": " + labels.size() + " labels");
}
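/**
 * Gathers every internal link in the markup and updates the occurrence
 * counts for its anchor text. Interlanguage links and links to targets
 * outside the main namespace are ignored.
 */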
public Map<CharSequence,LabelOccurrences> handleLinks(DumpPage page, String markup, Map<CharSequence,LabelOccurrences> labels, Reporter reporter) {
//logger.info("markup: " + markup);
Vector<int[]> linkRegions = stripper.gatherComplexRegions(markup, "\\[\\[", "\\]\\]") ;
for(int[] linkRegion: linkRegions) {
reporter.progress();
String linkMarkup = markup.substring(linkRegion[0]+2, linkRegion[1]-2) ;
DumpLink link = null ;
try {
link = linkParser.parseLink(linkMarkup, page.getTitle()) ;
} catch (Exception e) {
logger.warn("Could not parse link markup '" + linkMarkup + "'") ;
}
if (link == null)
continue ;
if (link.getTargetLanguage() != null)
continue ;
if (link.getTargetNamespace().getKey() != SiteInfo.MAIN_KEY)
continue ;
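//record the anchor as a label occurrence: the doc count contributes at most 1 per page, while the occurrence count accumulates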
LabelOccurrences lo = labels.get(link.getAnchor()) ;
if (lo == null)
lo = new LabelOccurrences(0,0,0,0) ;
lo.setLinkDocCount(1);
lo.setLinkOccCount(lo.getLinkOccCount() + 1);
labels.put(link.getAnchor(), lo) ;
}
return labels ;
}
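/**
 * Scans one sentence for plain-text occurrences of known labels by testing
 * every token window, up to the longest sensible label length, against the
 * label cache. The name mightContain suggests a probabilistic membership
 * test (e.g. a Bloom filter), so false positives are possible here; they are
 * presumably filtered out when these counts are joined against the actual
 * label set downstream.
 */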
public Map<CharSequence,LabelOccurrences> handleSentence(String sentence, Map<CharSequence,LabelOccurrences> labels, Reporter reporter) {
Tokenizer tokenizer = SimpleTokenizer.INSTANCE ;
Span[] spans = tokenizer.tokenizePos(sentence) ;
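//every span of consecutive tokens starting at startIndex is a candidate label, up to the maximum sensible length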
for (int startIndex=0 ; startIndex<spans.length ; startIndex++) {
reporter.progress();
for (int endIndex=startIndex ; endIndex < startIndex + labelCache.getMaxSensibleLabelLength() && endIndex < spans.length ; endIndex++) {
CharSequence label = sentence.substring(spans[startIndex].getStart(), spans[endIndex].getEnd()) ;
//logger.info(" - " + label);
if (!labelCache.mightContain(label))
continue ;
LabelOccurrences lo = labels.get(label) ;
if (lo == null)
lo = new LabelOccurrences(0,0,0,0) ;
lo.setTextDocCount(1);
lo.setTextOccCount(lo.getTextOccCount() + 1);
labels.put(label, lo) ;
}
}
return labels ;
}
@Override
public void close() throws IOException {
//nothing to clean up
}
}