// M1ViterbiExtract.java example
//
// Explorer
// Cloud9-master
// - src
package edu.umd.hooka.alignment;

import edu.umd.hooka.alignment.model1.Model1;
import edu.umd.hooka.ttables.TTable;
import edu.umd.hooka.ttables.TTable_monolithic_IFAs;

import java.io.BufferedInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;

import edu.umd.hooka.Alignment;
import edu.umd.hooka.PhrasePair;

/**
 * Reads a bitext and generates a TTable object (serialized) based on the
 * (e,f) cooccurrences in the text.
 * 
 * @author redpony
 *
 */
public class M1ViterbiExtract {
	
	//static final String bitext  ="/shared/bitexts/small.ar-en.ldc/ar-en.bitext";
	//static final String ttable  ="/user/redpony/small.ar-en.ttable";

	static final String bitext  ="/shared/bitexts/hansards.fr-en/hansards.aachen.bitext";
	static final String ttable  ="/user/redpony/hansards.aachen.ttable";

	static protected TTable loadTTable(Path path) throws IOException {
		org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
		FileSystem fileSys = FileSystem.get(conf);
	
		DataInput in = new DataInputStream(new BufferedInputStream(fileSys.open(path)));
		TTable_monolithic_IFAs tt = new TTable_monolithic_IFAs();
		tt.readFields(in);
		
		return tt;
	}
	
	public static class M1ViterbiMapper extends MapReduceBase
	  implements Mapper<IntWritable,PhrasePair,IntWritable,Text> {
		
		Text out = new Text();
		PerplexityReporter cr = new PerplexityReporter();
		Model1 m1 = null;
		public void map(IntWritable key, PhrasePair value, 
		                    OutputCollector<IntWritable,Text> output, 
		                    Reporter reporter) throws IOException {
			if (m1 == null) {
				Path pathTTable = new Path(ttable);
				TTable tt = loadTTable(pathTTable);
				m1 = new Model1(tt, true);
			}
			cr.reset();
			Alignment a = m1.viterbiAlign(value, cr);
			out.set(a.toString());
			output.collect(key, out);
			reporter.incrCounter(CrossEntropyCounters.LOGPROB, (long)(cr.getTotalLogProb()));
			reporter.incrCounter(CrossEntropyCounters.WORDCOUNT, cr.getTotalWordCount());
		}
		
		public void close() {
		}
	}
		
	@SuppressWarnings("deprecation")
	public static void main(String[] args) throws IOException {
		int mapTasks    = 15;
		
		JobConf conf = new JobConf(M1ViterbiMapper.class);
		conf.setJobName("m1viterbi");
		conf.setOutputKeyClass(LongWritable.class);
		conf.setOutputValueClass(Text.class);
		conf.setMapperClass(M1ViterbiMapper.class);		        
		conf.setNumMapTasks(mapTasks);
		conf.setNumReduceTasks(0);
		conf.setInputFormat(SequenceFileInputFormat.class);
		FileInputFormat.setInputPaths(conf, new Path(bitext));
		FileOutputFormat.setOutputPath(conf, new Path("somealigns.test"));

		RunningJob rj = JobClient.runJob(conf);
		Counters cs = rj.getCounters();
		double lp = (double)cs.getCounter(CrossEntropyCounters.LOGPROB);
		double wc = (double)cs.getCounter(CrossEntropyCounters.WORDCOUNT);
		double ce = (lp / wc) / Math.log(2.0);
		System.out.println("Viterbi cross-entropy: " + ce + "   perplexity: " + Math.pow(2.0, ce));
	}
	
}