package org.wikipedia.miner.extract.steps.pageSummary;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapred.Pair;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobStatus;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.log4j.Logger;
import org.wikipedia.miner.extract.DumpExtractor;
import org.wikipedia.miner.extract.model.struct.LabelSummary;
import org.wikipedia.miner.extract.model.struct.LinkSummary;
import org.wikipedia.miner.extract.model.struct.PageDetail;
import org.wikipedia.miner.extract.model.struct.PageKey;
import org.wikipedia.miner.extract.model.struct.PageSummary;
import org.wikipedia.miner.extract.steps.IterativeStep;
import org.wikipedia.miner.extract.steps.pageSummary.CombinerOrReducer.Combiner;
import org.wikipedia.miner.extract.steps.pageSummary.CombinerOrReducer.Reducer;
import org.wikipedia.miner.extract.util.UncompletedStepException;
import org.wikipedia.miner.extract.util.XmlInputFormat;
/**
* @author dmilne
*
 * This step produces PageDetail structures. It must be run multiple times,
 * because the structures are built up gradually, one iteration at a time.
 * Completion is indicated when all of the Unforwarded counters reach 0.
 *
 * The number of iterations required is bounded by the length of the longest
 * chain of redirects (i.e. a redirect pointing to a redirect pointing to a
 * redirect, and so on).
 *
 * The first iteration reads directly from the XML dump; each subsequent
 * iteration reads the output of the previous one.
 *
 * The page summaries omit the namespace and title fields, because these are
 * already stored in the page keys (repeating them there would be wasteful).
*
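 * A typical driver loop looks something like the following (a sketch only;
 * the real driver lives in DumpExtractor, and workingDir and args are
 * assumed to be supplied by it):
 *
 * <pre>
 * int iteration = 0 ;
 * PageSummaryStep step ;
 * do {
 *     step = new PageSummaryStep(workingDir, iteration) ;
 *     step.run(args) ;
 *     iteration++ ;
 * } while (step.furtherIterationsRequired()) ;
 * </pre>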
*/
public class PageSummaryStep extends IterativeStep {
private static Logger logger = Logger.getLogger(PageSummaryStep.class) ;
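/**
 * The broad categories of page that this step distinguishes between.
 */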
public enum SummaryPageType {article, category, disambiguation, articleRedirect, categoryRedirect, unparseable} ;
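/**
 * Counters for summaries that have been gathered but not yet forwarded along
 * redirect chains. The step has converged once all of these reach 0.
 */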
public enum Unforwarded {redirect, linkIn, linkOut, parentCategory, childCategory, childArticle} ;
private Map<Unforwarded,Long> unforwardedCounts ;
public PageSummaryStep(Path workingDir, int iteration) throws IOException {
super(workingDir, iteration);
}
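/**
 * Returns true if any summaries remain unforwarded, i.e. if another iteration
 * of this step is required before the PageDetail structures are complete.
 */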
public boolean furtherIterationsRequired() {
for (Long count:unforwardedCounts.values()) {
if (count > 0)
return true ;
}
return false ;
}
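// The clone helpers below deep-copy records via their Avro builders; callers
// should clone any record they intend to keep, because Hadoop and Avro may
// reuse the underlying instances between calls.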
public static PageSummary clone(PageSummary summary) {
return PageSummary.newBuilder(summary).build() ;
}
public static LinkSummary clone(LinkSummary summary) {
return LinkSummary.newBuilder(summary).build() ;
}
public static PageDetail clone(PageDetail pageDetail) {
return PageDetail.newBuilder(pageDetail).build() ;
}
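/**
 * Builds a PageDetail with every collection field initialised to an empty
 * collection, so that downstream code can add to it without null checks.
 */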
public static PageDetail buildEmptyPageDetail() {
PageDetail p = new PageDetail() ;
p.setIsDisambiguation(false);
p.setSentenceSplits(new ArrayList<Integer>());
p.setRedirects(new ArrayList<PageSummary>()) ;
p.setLinksIn(new ArrayList<LinkSummary>());
p.setLinksOut(new ArrayList<LinkSummary>());
p.setParentCategories(new ArrayList<PageSummary>());
p.setChildCategories(new ArrayList<PageSummary>());
p.setChildArticles(new ArrayList<PageSummary>());
p.setLabels(new HashMap<CharSequence,LabelSummary>()) ;
return p ;
}
@Override
public int run(String[] args) throws UncompletedStepException, IOException {
logger.info("Starting page summary step (iteration " + getIteration() + ")");
if (isFinished()) {
logger.info(" - already completed");
loadUnforwardedCounts() ;
return 0 ;
} else
reset() ;
JobConf conf = new JobConf(PageSummaryStep.class);
DumpExtractor.configureJob(conf, args) ;
conf.setJobName("WM: page summary (" + getIteration() + ")");
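// the first iteration parses raw <page> elements from the XML dump; later
// iterations consume the (PageKey, PageDetail) pairs emitted by the previous pass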
if (getIteration() == 0) {
conf.setMapperClass(InitialMapper.class);
conf.setOutputKeyClass(AvroKey.class);
conf.setOutputValueClass(AvroValue.class);
conf.setInputFormat(XmlInputFormat.class);
conf.set(XmlInputFormat.START_TAG_KEY, "<page>") ;
conf.set(XmlInputFormat.END_TAG_KEY, "</page>") ;
FileInputFormat.setInputPaths(conf, conf.get(DumpExtractor.KEY_INPUT_FILE));
DistributedCache.addCacheFile(new Path(conf.get(DumpExtractor.KEY_SENTENCE_MODEL)).toUri(), conf);
} else {
AvroJob.setMapperClass(conf, SubsequentMapper.class);
AvroJob.setInputSchema(conf, Pair.getPairSchema(PageKey.getClassSchema(),PageDetail.getClassSchema()));
FileInputFormat.setInputPaths(conf, getWorkingDir() + Path.SEPARATOR + getDirName(getIteration()-1));
}
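// regardless of iteration, all mappers need the siteinfo and language files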
DistributedCache.addCacheFile(new Path(conf.get(DumpExtractor.KEY_OUTPUT_DIR) + "/" + DumpExtractor.OUTPUT_SITEINFO).toUri(), conf);
DistributedCache.addCacheFile(new Path(conf.get(DumpExtractor.KEY_LANG_FILE)).toUri(), conf);
AvroJob.setCombinerClass(conf, Combiner.class) ;
AvroJob.setReducerClass(conf, Reducer.class);
AvroJob.setOutputSchema(conf, Pair.getPairSchema(PageKey.getClassSchema(),PageDetail.getClassSchema()));
FileOutputFormat.setOutputPath(conf, getDir());
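// runJob blocks until the job either completes or fails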
RunningJob runningJob = JobClient.runJob(conf);
if (runningJob.getJobState() == JobStatus.SUCCEEDED) {
finish(runningJob) ;
return 0 ;
}
throw new UncompletedStepException() ;
}
@Override
public String getDirName(int iteration) {
return "pageSummary_" + iteration ;
}
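/**
 * The unforwarded counts are persisted to a file alongside the step's output,
 * so that a finished iteration can be reloaded without rerunning its job.
 */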
private Path getUnforwardedCountsPath() {
return new Path(getDir() + Path.SEPARATOR + "unforwarded") ;
}
private void saveUnforwardedCounts() throws IOException {
    FSDataOutputStream out = getHdfs().create(getUnforwardedCountsPath());
    try {
        // write one (name, count) pair for every Unforwarded value
        for (Unforwarded u:Unforwarded.values()) {
            out.writeUTF(u.name()) ;
            Long count = unforwardedCounts.get(u) ;
            out.writeLong(count == null ? 0L : count) ;
        }
    } finally {
        // close the stream even if a write fails
        out.close();
    }
}
private void loadUnforwardedCounts() throws IOException {
    unforwardedCounts = new HashMap<Unforwarded,Long>() ;
    FSDataInputStream in = getHdfs().open(getUnforwardedCountsPath());
    try {
        // saveUnforwardedCounts() writes exactly one (name, count) pair per
        // Unforwarded value, so read a fixed number of entries rather than
        // relying on available(), which is unreliable as an end-of-stream test
        for (int i=0 ; i<Unforwarded.values().length ; i++) {
            String u = in.readUTF() ;
            long count = in.readLong() ;
            unforwardedCounts.put(Unforwarded.valueOf(u), count) ;
        }
    } finally {
        in.close() ;
    }
}
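/**
 * Gathers the Unforwarded counters from the completed job and persists them,
 * so that furtherIterationsRequired() can be answered on later runs.
 */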
public void finish(RunningJob runningJob) throws IOException {
super.finish(runningJob) ;
unforwardedCounts = new HashMap<Unforwarded,Long>() ;
for (Unforwarded u:Unforwarded.values()) {
Counters.Counter counter = runningJob.getCounters().findCounter(u) ;
if (counter != null)
unforwardedCounts.put(u, counter.getCounter()) ;
else
unforwardedCounts.put(u, 0L) ;
}
saveUnforwardedCounts() ;
}
}