package edu.umd.hooka.alignment;

import edu.umd.hooka.alignment.model1.Model1;
import edu.umd.hooka.ttables.TTable;
import edu.umd.hooka.ttables.TTable_monolithic_IFAs;

import java.io.BufferedInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;

import edu.umd.hooka.Alignment;
import edu.umd.hooka.PhrasePair;

/**
 * Runs a map-only Hadoop job that loads a serialized {@link TTable}, builds a
 * {@link Model1} from it, and emits the Viterbi alignment of every
 * {@link PhrasePair} in the bitext. After the job finishes, the cross-entropy
 * and perplexity derived from the job counters are printed to standard output.
 *
 * @author redpony
 *
 */
public class M1ViterbiExtract {

  // Alternative (smaller) data set, kept for reference:
  //static final String bitext ="/shared/bitexts/small.ar-en.ldc/ar-en.bitext";
  //static final String ttable ="/user/redpony/small.ar-en.ttable";

  /** Path of the bitext that is used as the job input. */
  static final String bitext = "/shared/bitexts/hansards.fr-en/hansards.aachen.bitext";

  /** Path of the serialized translation table that each mapper loads. */
  static final String ttable = "/user/redpony/hansards.aachen.ttable";

  /**
   * Reads a serialized translation table from the file system selected by a
   * default Hadoop configuration.
   *
   * @param path location of the serialized table
   * @return the table that was read
   * @throws IOException if the file cannot be opened, read, or closed
   */
  static protected TTable loadTTable(Path path) throws IOException {
    org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
    FileSystem fileSys = FileSystem.get(conf);
    DataInputStream in = new DataInputStream(new BufferedInputStream(fileSys.open(path)));
    try {
      TTable_monolithic_IFAs tt = new TTable_monolithic_IFAs();
      tt.readFields(in);
      return tt;
    } finally {
      // Release the underlying stream once the table is in memory. The
      // FileSystem handle itself is left open on purpose.
      in.close();
    }
  }

  /**
   * Mapper that replaces each sentence pair with the textual form of its
   * Viterbi alignment and accumulates log-probability and word-count
   * counters for the whole job.
   */
  public static class M1ViterbiMapper extends MapReduceBase
      implements Mapper<IntWritable,PhrasePair,IntWritable,Text> {

    /** Output value, reused across calls to {@code map}. */
    Text out = new Text();

    /** Collects per-sentence statistics; reset before every alignment. */
    PerplexityReporter cr = new PerplexityReporter();

    /** Created on the first call to {@code map}. */
    Model1 m1 = null;

    /**
     * Aligns one sentence pair and emits the alignment under the input key.
     *
     * @param key      key of the sentence pair; passed through unchanged
     * @param value    the sentence pair to align
     * @param output   receives the (key, alignment text) record
     * @param reporter receives the counter increments
     * @throws IOException if the translation table cannot be loaded or the
     *                     record cannot be collected
     */
    public void map(IntWritable key, PhrasePair value,
        OutputCollector<IntWritable,Text> output,
        Reporter reporter) throws IOException {
      if (m1 == null) {
        // Load the table only once per mapper instance.
        Path pathTTable = new Path(ttable);
        TTable tt = loadTTable(pathTTable);
        m1 = new Model1(tt, true);
      }
      cr.reset();
      Alignment a = m1.viterbiAlign(value, cr);
      out.set(a.toString());
      output.collect(key, out);
      // Counters are long-valued, so the cast drops any fractional part of
      // the per-sentence value.
      // NOTE(review): presumably getTotalLogProb() returns a floating-point
      // value -- confirm whether this truncation is acceptable.
      reporter.incrCounter(CrossEntropyCounters.LOGPROB, (long)(cr.getTotalLogProb()));
      reporter.incrCounter(CrossEntropyCounters.WORDCOUNT, cr.getTotalWordCount());
    }

    /** Nothing to release. */
    public void close() { }
  }

  /**
   * Configures and runs the alignment job, then prints the cross-entropy and
   * perplexity computed from the job counters.
   *
   * @param args ignored
   * @throws IOException if the job cannot be submitted or fails
   */
  @SuppressWarnings("deprecation")
  public static void main(String[] args) throws IOException {
    int mapTasks = 15;
    JobConf conf = new JobConf(M1ViterbiMapper.class);
    conf.setJobName("m1viterbi");

    // The mapper emits IntWritable keys, so the declared key class must
    // agree with it.
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(M1ViterbiMapper.class);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0); // map-only job
    conf.setInputFormat(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(bitext));
    FileOutputFormat.setOutputPath(conf, new Path("somealigns.test"));

    RunningJob rj = JobClient.runJob(conf);
    Counters cs = rj.getCounters();
    double lp = (double)cs.getCounter(CrossEntropyCounters.LOGPROB);
    double wc = (double)cs.getCounter(CrossEntropyCounters.WORDCOUNT);
    // Per-word value divided by ln(2).
    // NOTE(review): this reads as a nats-to-bits conversion, which assumes
    // the LOGPROB counter holds natural-log values -- confirm against
    // PerplexityReporter.
    double ce = (lp / wc) / Math.log(2.0);
    System.out.println("Viterbi cross-entropy: " + ce + " perplexity: " + Math.pow(2.0, ce));
  }
}