package org.wikipedia.miner.extract.steps.primaryLabel;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.mapred.AvroCollector;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroMapper;
import org.apache.avro.mapred.AvroReducer;
import org.apache.avro.mapred.Pair;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobStatus;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.log4j.Logger;
import org.wikipedia.miner.extract.DumpExtractor;
import org.wikipedia.miner.extract.model.struct.LabelSense;
import org.wikipedia.miner.extract.model.struct.LabelSenseList;
import org.wikipedia.miner.extract.model.struct.PrimaryLabels;
import org.wikipedia.miner.extract.steps.Step;
import org.wikipedia.miner.extract.steps.labelSenses.LabelSensesStep;
import org.wikipedia.miner.extract.util.UncompletedStepException;

/**
 * Step that inverts the output of {@link LabelSensesStep}: for each label, its
 * first sense is taken as the page the label primarily refers to, and the
 * output gathers, per page id, all labels whose primary sense is that page.
 */
public class PrimaryLabelStep extends Step {

    private static Logger logger = Logger.getLogger(PrimaryLabelStep.class);

    private LabelSensesStep labelSensesStep;

    public PrimaryLabelStep(Path workingDir, LabelSensesStep labelSensesStep) throws IOException {
        super(workingDir);
        this.labelSensesStep = labelSensesStep;
    }

    @Override
    public int run(String[] args) throws Exception {

        logger.info("Starting primary label step");

        if (isFinished()) {
            logger.info(" - already completed");
            return 0;
        } else {
            reset();
        }

        // Locate the job jar via this step's own class.
        JobConf conf = new JobConf(PrimaryLabelStep.class);
        DumpExtractor.configureJob(conf, args);

        conf.setJobName("WM: primary labels");

        // Read the (label -> senses) pairs produced by the label senses step.
        FileInputFormat.setInputPaths(conf, getWorkingDir() + Path.SEPARATOR + labelSensesStep.getDirName());
        AvroJob.setInputSchema(conf, Pair.getPairSchema(Schema.create(Type.STRING), LabelSenseList.getClassSchema()));

        // Emit (page id -> primary labels) pairs.
        AvroJob.setOutputSchema(conf, Pair.getPairSchema(Schema.create(Type.INT), PrimaryLabels.getClassSchema()));

        AvroJob.setMapperClass(conf, Mapper.class);
        AvroJob.setCombinerClass(conf, Reducer.class);
        AvroJob.setReducerClass(conf, Reducer.class);

        FileOutputFormat.setOutputPath(conf, getDir());

        RunningJob runningJob = JobClient.runJob(conf);

        if (runningJob.getJobState() == JobStatus.SUCCEEDED) {
            finish(runningJob);
            return 0;
        }

        throw new UncompletedStepException();
    }

    @Override
    public String getDirName() {
        return "primaryLabels";
    }

    /**
     * For each label, emits the label keyed by the id of its first sense (the
     * sense list is ordered so that the primary sense comes first).
     */
    public static class Mapper extends AvroMapper<Pair<CharSequence, LabelSenseList>, Pair<Integer, PrimaryLabels>> {

        @Override
        public void map(Pair<CharSequence, LabelSenseList> pair,
                AvroCollector<Pair<Integer, PrimaryLabels>> collector, Reporter reporter) throws IOException {

            CharSequence label = pair.key();
            LabelSenseList senses = pair.value();

            // Labels with no senses have no primary sense to report.
            if (senses.getSenses().isEmpty())
                return;

            LabelSense firstSense = senses.getSenses().get(0);

            ArrayList<CharSequence> primaryLabels = new ArrayList<CharSequence>();
            primaryLabels.add(label);

            collector.collect(new Pair<Integer, PrimaryLabels>(firstSense.getId(), new PrimaryLabels(primaryLabels)));
        }
    }

    /**
     * Concatenates the partial label lists collected for a page into a single
     * {@link PrimaryLabels} record. Also used as the combiner, since
     * concatenation is associative.
     */
    public static class Reducer extends AvroReducer<Integer, PrimaryLabels, Pair<Integer, PrimaryLabels>> {

        @Override
        public void reduce(Integer pageId, Iterable<PrimaryLabels> partials,
                AvroCollector<Pair<Integer, PrimaryLabels>> collector, Reporter reporter) throws IOException {

            ArrayList<CharSequence> primaryLabels = new ArrayList<CharSequence>();

            for (PrimaryLabels partial : partials) {
                // Avro reuses the record instance across iterations, so copy
                // each partial before accumulating its labels.
                PrimaryLabels clone = PrimaryLabels.newBuilder(partial).build();
                primaryLabels.addAll(clone.getLabels());
            }

            collector.collect(new Pair<Integer, PrimaryLabels>(pageId, new PrimaryLabels(primaryLabels)));
        }
    }
}
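/*
 * Usage sketch: the concrete wiring lives in the DumpExtractor driver and may
 * differ. Assuming the step is driven directly via its run(String[]) entry
 * point, a caller might do the following, where workingDir, sensesStep and
 * args are illustrative placeholders:
 *
 *     LabelSensesStep sensesStep = ... ;                  // completed earlier
 *     PrimaryLabelStep step = new PrimaryLabelStep(workingDir, sensesStep);
 *     int exitCode = step.run(args);                      // 0 on success
 */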