package edu.umd.hooka;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Map;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

import edu.umd.hooka.alignment.IndexedFloatArray;

public class PhraseExtractAndCount_stripe {

  /**
   * Extracts alignment-consistent phrase pairs from each aligned sentence
   * pair and emits one count "stripe" per extracted pair in each direction:
   * the e-side phrase keyed to a map {f: 1}, and the f-side phrase keyed to
   * a map {e: 1}.
   */
  public static class PhrasePairExtractMapper extends MapReduceBase
      implements Mapper<IntWritable, PhrasePair, Phrase, Phrase2CountMap> {

    private final Phrase2CountMap pcm = new Phrase2CountMap();
    private static final FloatWritable one = new FloatWritable(1.0f);

    public void map(IntWritable key, PhrasePair value,
        OutputCollector<Phrase, Phrase2CountMap> output, Reporter reporter)
        throws IOException {
      // Extract all phrase pairs consistent with the alignment, up to
      // length 7 on either side.
      ArrayList<PhrasePair> extracts = value.extractConsistentPhrasePairs(7);
      for (PhrasePair p : extracts) {
        // e -> {f: 1}
        pcm.clear();
        pcm.put(p.getF(), one);
        output.collect(p.getE(), pcm);
        // f -> {e: 1}
        pcm.clear();
        pcm.put(p.getE(), one);
        output.collect(p.getF(), pcm);
      }
    }
  }

  /**
   * Combiner: element-wise sum of the count stripes emitted for a phrase.
   */
  public static class PPCountCombiner extends MapReduceBase
      implements Reducer<Phrase, Phrase2CountMap, Phrase, Phrase2CountMap> {

    private final Phrase2CountMap sum = new Phrase2CountMap();

    public void reduce(Phrase key, Iterator<Phrase2CountMap> values,
        OutputCollector<Phrase, Phrase2CountMap> output, Reporter reporter)
        throws IOException {
      sum.clear();
      while (values.hasNext()) {
        sum.plusEquals(values.next());
      }
      output.collect(key, sum);
    }
  }

  /**
   * Sums all stripes for a phrase, normalizes the counts into conditional
   * probabilities, and emits one (PhrasePair, scores) record per stripe
   * entry. When the key is an e-side phrase (language id 0), the entries
   * are f phrases and the normalized value p(f|e) is stored in slot 1;
   * otherwise the value p(e|f) is stored in slot 0.
   */
  public static class PPNormalizingReducer extends MapReduceBase
      implements Reducer<Phrase, Phrase2CountMap, PhrasePair, IndexedFloatArray> {

    private final Phrase2CountMap sum = new Phrase2CountMap();
    private final PhrasePair ko = new PhrasePair();
    private final IndexedFloatArray scores = new IndexedFloatArray(2);

    public void reduce(Phrase key, Iterator<Phrase2CountMap> values,
        OutputCollector<PhrasePair, IndexedFloatArray> output, Reporter reporter)
        throws IOException {
      sum.clear();
      int sc = 0;
      while (values.hasNext()) {
        sc++;
        if (sc % 1000 == 0) {
          reporter.progress(); // keep the task alive on large stripes
        }
        sum.plusEquals(values.next());
      }
      sum.normalize(); // counts -> conditional probabilities
      boolean transpose = (key.getLanguage() == 0);
      if (transpose) {
        ko.setE(key);
      } else {
        ko.setF(key);
      }
      for (Map.Entry<Phrase, FloatWritable> i : sum.entrySet()) {
        scores.clear();
        if (transpose) {
          ko.setF(i.getKey());
          scores.set(1, i.getValue().get()); // p(f|e)
        } else {
          ko.setE(i.getKey());
          scores.set(0, i.getValue().get()); // p(e|f)
        }
        output.collect(ko, scores);
      }
    }
  }

  /**
   * Second pass: merges the two directional score records for each phrase
   * pair. Since slots 0 and 1 are set in disjoint records, element-wise
   * addition yields an array holding both translation probabilities.
   */
  public static class ReduceSumScores extends MapReduceBase
      implements Reducer<PhrasePair, IndexedFloatArray, PhrasePair, IndexedFloatArray> {

    private final IndexedFloatArray scores = new IndexedFloatArray(2);

    public void reduce(PhrasePair key, Iterator<IndexedFloatArray> values,
        OutputCollector<PhrasePair, IndexedFloatArray> output, Reporter reporter)
        throws IOException {
      scores.clear();
      while (values.hasNext()) {
        scores.plusEquals(values.next());
      }
      output.collect(key, scores);
    }
  }

  @SuppressWarnings("deprecation")
  public static void main(String[] args) throws IOException {
    CorpusInfo corpus = CorpusInfo.getCorpus(CorpusInfo.Corpus.ARABIC_5000k);
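    // Two-pass pipeline:
    //   Job 1 extracts alignment-consistent phrase pairs, accumulates count
    //   stripes per phrase, and normalizes each stripe into conditional
    //   probabilities, emitting one directional score per phrase pair.
    //   Job 2 groups by phrase pair and sums the two directional records,
    //   producing both p(e|f) and p(f|e) in a single IndexedFloatArray.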
    Path ppCountTemp = new Path("ppc.phase1.tmp");
    int mapTasks = 38;
    int reduceTasks = 38;

    // Job 1: extract, count, and normalize per direction.
    JobConf conf = new JobConf(PhraseExtractAndCount_stripe.class);
    conf.setJobName("BuildPT.ExtractAndCount_striped");
    FileSystem.get(conf).delete(ppCountTemp, true);
    FileSystem.get(conf).delete(corpus.getLocalPhraseTable(), true);
    conf.setOutputKeyClass(PhrasePair.class);
    conf.setOutputValueClass(IndexedFloatArray.class);
    conf.setMapOutputKeyClass(Phrase.class);
    conf.setMapOutputValueClass(Phrase2CountMap.class);
    conf.setMapperClass(PhrasePairExtractMapper.class);
    conf.setCombinerClass(PPCountCombiner.class);
    conf.setReducerClass(PPNormalizingReducer.class);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    FileInputFormat.setInputPaths(conf, corpus.getAlignedBitext());
    FileOutputFormat.setOutputPath(conf, ppCountTemp);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    JobClient.runJob(conf);

    // Job 2: merge the directional scores for each phrase pair. The mapper
    // defaults to the identity mapper, and the output format defaults to
    // TextOutputFormat, so the final phrase table is written as text.
    conf = new JobConf(PhraseExtractAndCount_stripe.class);
    conf.setJobName("BuildPT.Merge");
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(PhrasePair.class);
    conf.setOutputValueClass(IndexedFloatArray.class);
    conf.setReducerClass(ReduceSumScores.class);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    FileInputFormat.setInputPaths(conf, ppCountTemp);
    FileOutputFormat.setOutputPath(conf, corpus.getLocalPhraseTable());
    JobClient.runJob(conf);
  }
}