package org.wikipedia.miner.extract.steps.pageDepth;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.Pair;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobStatus;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.log4j.Logger;
import org.wikipedia.miner.extract.DumpExtractor;
import org.wikipedia.miner.extract.model.struct.PageDepthSummary;
import org.wikipedia.miner.extract.model.struct.PageDetail;
import org.wikipedia.miner.extract.steps.IterativeStep;
import org.wikipedia.miner.extract.steps.pageDepth.DepthCombinerOrReducer.Counts;
import org.wikipedia.miner.extract.steps.pageDepth.DepthCombinerOrReducer.DepthCombiner;
import org.wikipedia.miner.extract.steps.pageDepth.DepthCombinerOrReducer.DepthReducer;
import org.wikipedia.miner.extract.steps.sortedPages.PageSortingStep;
import org.wikipedia.miner.extract.util.UncompletedStepException;

/**
 * An iterative Avro map-reduce step that resolves page depths. Iteration 0
 * reads the sorted page summaries produced by {@link PageSortingStep}; each
 * later iteration reads the previous iteration's output and forwards depths,
 * until no unforwarded depths remain.
 */
public class PageDepthStep extends IterativeStep {

	private static Logger logger = Logger.getLogger(PageDepthStep.class);

	private PageSortingStep finalPageSummaryStep;

	private Map<Counts, Long> counts;

	public PageDepthStep(Path workingDir, int iteration, PageSortingStep finalPageSummaryStep) throws IOException {
		super(workingDir, iteration);
		this.finalPageSummaryStep = finalPageSummaryStep;
	}

	@Override
	public int run(String[] args) throws Exception {

		logger.info("Starting page depth step (iteration " + getIteration() + ")");

		// Skip the job entirely if this iteration has already completed;
		// just reload the persisted counters so furtherIterationsRequired() still works.
		if (isFinished()) {
			logger.info(" - already completed");
			loadCounts();
			return 0;
		} else {
			reset();
		}

		JobConf conf = new JobConf(PageDepthStep.class);
		DumpExtractor.configureJob(conf, args);

		conf.setJobName("WM: page depth (" + getIteration() + ")");

		if (getIteration() == 0) {
			// First iteration: read sorted page details and seed the initial depths.
			FileInputFormat.setInputPaths(conf, getWorkingDir() + Path.SEPARATOR + finalPageSummaryStep.getDirName());
			AvroJob.setInputSchema(conf, Pair.getPairSchema(Schema.create(Type.INT), PageDetail.getClassSchema()));

			DistributedCache.addCacheFile(new Path(conf.get(DumpExtractor.KEY_LANG_FILE)).toUri(), conf);

			AvroJob.setMapperClass(conf, InitialDepthMapper.class);
		} else {
			// Subsequent iterations: read the previous iteration's output and forward depths.
			FileInputFormat.setInputPaths(conf, getWorkingDir() + Path.SEPARATOR + getDirName(getIteration() - 1));
			AvroJob.setInputSchema(conf, Pair.getPairSchema(Schema.create(Type.INT), PageDepthSummary.getClassSchema()));

			AvroJob.setMapperClass(conf, SubsequentDepthMapper.class);
		}

		AvroJob.setOutputSchema(conf, Pair.getPairSchema(Schema.create(Type.INT), PageDepthSummary.getClassSchema()));

		AvroJob.setCombinerClass(conf, DepthCombiner.class);
		AvroJob.setReducerClass(conf, DepthReducer.class);

		FileOutputFormat.setOutputPath(conf, getDir());

		RunningJob runningJob = JobClient.runJob(conf);

		if (runningJob.getJobState() == JobStatus.SUCCEEDED) {
			finish(runningJob);
			return 0;
		}

		throw new UncompletedStepException();
	}

	public boolean furtherIterationsRequired() {
		return counts.get(Counts.unforwarded) > 0;
	}

	@Override
	public String getDirName(int iteration) {
		return "pageDepth_" + iteration;
	}

	private Path getCountsPath() {
		return new Path(getDir() + Path.SEPARATOR + "counts");
	}

	private void saveCounts() throws IOException {

		FSDataOutputStream out = getHdfs().create(getCountsPath());

		for (Counts c : Counts.values()) {
			out.writeUTF(c.name());

			Long count = counts.get(c);
			if (count != null)
				out.writeLong(count);
			else
				out.writeLong(0L);
		}

		out.close();
	}

	private void loadCounts() throws IOException {

		counts = new HashMap<Counts, Long>();

		FSDataInputStream in = getHdfs().open(getCountsPath());

		while (in.available() > 0) {
			String c = in.readUTF();
			Long count = in.readLong();

			counts.put(Counts.valueOf(c), count);
		}

		in.close();
	}

	public void finish(RunningJob runningJob) throws IOException {

		super.finish(runningJob);

		// Copy the job's counters into a map and persist them, so a later run can
		// call furtherIterationsRequired() without re-running this iteration.
		counts = new HashMap<Counts, Long>();

		for (Counts count : Counts.values()) {
			Counters.Counter counter = runningJob.getCounters().findCounter(count);

			if (counter != null)
				counts.put(count, counter.getCounter());
			else
				counts.put(count, 0L);
		}

		saveCounts();
	}
}
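
// A minimal usage sketch, not part of the original source: how a driver such as
// DumpExtractor might loop this step until no unforwarded depths remain. It assumes
// IterativeStep implements Hadoop's Tool interface and that `workingDir`,
// `pageSortingStep` and `args` are already prepared by the caller.
//
//   int iteration = 0;
//   PageDepthStep depthStep;
//   do {
//       depthStep = new PageDepthStep(workingDir, iteration, pageSortingStep);
//       ToolRunner.run(new Configuration(), depthStep, args);
//       iteration++;
//   } while (depthStep.furtherIterationsRequired());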