package org.wikipedia.miner.extract.steps.sortedPages;

import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.mapred.AvroCollector;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroMapper;
import org.apache.avro.mapred.AvroReducer;
import org.apache.avro.mapred.Pair;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobStatus;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.log4j.Logger;
import org.wikipedia.miner.extract.DumpExtractor;
import org.wikipedia.miner.extract.model.struct.PageDetail;
import org.wikipedia.miner.extract.model.struct.PageKey;
import org.wikipedia.miner.extract.steps.Step;
import org.wikipedia.miner.extract.steps.pageSummary.PageSummaryStep;
import org.wikipedia.miner.extract.util.UncompletedStepException;

/**
 * In this step we sort the page summaries produced by PageSummaryStep by id
 * (they were previously sorted by namespace:title).
 *
 * We also inject titles and namespaces into each page summary (they were
 * previously omitted, because they are already found in the keys and
 * repeating them there would be wasteful).
 */
public class PageSortingStep extends Step {

    private static Logger logger = Logger.getLogger(PageSortingStep.class);

    private PageSummaryStep finalPageSummaryStep;

    public PageSortingStep(Path workingDir, PageSummaryStep finalPageSummaryStep) throws IOException {
        super(workingDir);
        this.finalPageSummaryStep = finalPageSummaryStep;
    }

    @Override
    public int run(String[] args) throws Exception {

        logger.info("Starting page sorting step");

        if (isFinished()) {
            logger.info(" - already completed");
            return 0;
        } else {
            reset();
        }

        // was new JobConf(PageDepthStep.class), a copy-paste slip; this step's own class locates the jar
        JobConf conf = new JobConf(PageSortingStep.class);
        DumpExtractor.configureJob(conf, args);

        conf.setJobName("WM: sorted pages");

        // read the output of the final page summary iteration, keyed by (namespace, title)
        FileInputFormat.setInputPaths(conf, getWorkingDir() + Path.SEPARATOR + finalPageSummaryStep.getDirName());
        AvroJob.setInputSchema(conf, Pair.getPairSchema(PageKey.getClassSchema(), PageDetail.getClassSchema()));

        // emit the same page summaries, re-keyed (and therefore re-sorted) by page id
        AvroJob.setOutputSchema(conf, Pair.getPairSchema(Schema.create(Type.INT), PageDetail.getClassSchema()));

        AvroJob.setMapperClass(conf, Mapper.class);
        AvroJob.setReducerClass(conf, Reducer.class);

        FileOutputFormat.setOutputPath(conf, getDir());

        RunningJob runningJob = JobClient.runJob(conf);

        if (runningJob.getJobState() == JobStatus.SUCCEEDED) {
            finish(runningJob);
            return 0;
        }

        throw new UncompletedStepException();
    }

    @Override
    public String getDirName() {
        return "sortedPages";
    }

    /**
     * Re-keys each page summary by its id, and copies the namespace and title
     * from the old (namespace, title) key into the summary itself.
     */
    public static class Mapper extends AvroMapper<Pair<PageKey, PageDetail>, Pair<Integer, PageDetail>> {

        @Override
        public void map(Pair<PageKey, PageDetail> pair,
                AvroCollector<Pair<Integer, PageDetail>> collector,
                Reporter reporter) throws IOException {

            PageKey key = pair.key();
            PageDetail page = pair.value();

            page.setNamespace(key.getNamespace());
            page.setTitle(key.getTitle());

            collector.collect(new Pair<Integer, PageDetail>(page.getId(), page));
        }
    }

    /**
     * An identity reduce: page ids are unique, so this simply writes each page
     * back out, now in id order.
     */
    public static class Reducer extends AvroReducer<Integer, PageDetail, Pair<Integer, PageDetail>> {

        @Override
        public void reduce(Integer pageId, Iterable<PageDetail> pages,
                AvroCollector<Pair<Integer, PageDetail>> collector,
                Reporter reporter) throws IOException {

            for (PageDetail page : pages)
                collector.collect(new Pair<Integer, PageDetail>(pageId, page));
        }
    }
}
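
/*
 * A minimal usage sketch of how this step might be chained after the final
 * PageSummaryStep iteration. It assumes a driver along the lines of
 * DumpExtractor's pipeline; workingDir, args and summaryStep are hypothetical
 * placeholders supplied by the surrounding code, not names defined in this file.
 *
 *   PageSummaryStep summaryStep = ...; // the last (converged) summary iteration
 *   PageSortingStep sortingStep = new PageSortingStep(workingDir, summaryStep);
 *   int result = sortingStep.run(args); // 0 on success; throws UncompletedStepException otherwise
 */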