package edu.umd.hooka;

import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.streaming.StreamXmlRecordReader;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import edu.umd.hooka.alignment.aer.ReferenceAlignment;
import edu.umd.hooka.corpora.Chunk;
import edu.umd.hooka.corpora.Language;
import edu.umd.hooka.corpora.LanguagePair;
import edu.umd.hooka.corpora.ParallelChunk;
import edu.umd.hooka.corpora.ParallelCorpusReader;

public class CorpusVocabNormalizerAndNumberizer {

  static enum BitextCompilerCounters {
    EN_WORDS, FR_WORDS, CHUNKS, WRONG_LANGUAGE, SRC_TOO_LONG, TGT_TOO_LONG
  };

  private static final Logger sLogger = Logger.getLogger(CorpusVocabNormalizerAndNumberizer.class);

  static final String SRC_LANG = "ha.sourcelang";
  static final String TGT_LANG = "ha.targetlang";

  public static class BitextCompilerMapper extends MapReduceBase
      implements Mapper<Text, Text, Text, PhrasePair> {

    String outputBase = null;
    Path pf = null;
    Path pe = null;
    Path pa = null;
    static Vocab vocE = null;
    static Vocab vocF = null;
    ParallelCorpusReader pcr = new ParallelCorpusReader();
    Language src = null;
    Language tgt = null;
    AlignmentWordPreprocessor sawp = null;
    AlignmentWordPreprocessor tawp = null;
    LanguagePair lp = null;
    JobConf job_ = null;

    public void configure(JobConf job) {
      sLogger.setLevel(Level.OFF);
      src = Language.languageForISO639_1(job.get(SRC_LANG));
      tgt = Language.languageForISO639_1(job.get(TGT_LANG));
      sLogger.debug("Source language: " + src.code());
      sLogger.debug("Target language: " + tgt.code());
      boolean useVocabServer = false;
      if (!useVocabServer) {
        if (vocE == null)
          vocE = new VocabularyWritable();
        if (vocF == null)
          vocF = new VocabularyWritable();
      } else {
        try {
          vocE = new VocabServerClient(job.get("ha.vocabserver.host"),
              Integer.parseInt(job.get("ha.vocabserver.port1")));
          vocF = new VocabServerClient(job.get("ha.vocabserver.host"),
              Integer.parseInt(job.get("ha.vocabserver.port2")));
        } catch (IOException e) {
          e.printStackTrace();
          throw new RuntimeException(e);
        }
      }
      lp = LanguagePair.languageForISO639_1Pair(src.code() + "-" + tgt.code());
      if (job.getBoolean("ha.trunc.use", true)) {
        sawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, src, job);
        tawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, tgt, job);
      } else {
        sawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, job);
        tawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, job);
      }
      job_ = job;
    }

    public int[] convertStrings(String[] s, Vocab v) {
      int[] res = new int[s.length];
      for (int i = 0; i < s.length; ++i) {
        res[i] = v.addOrGet(s[i]);
        sLogger.info(s[i] + "-->" + res[i]);
      }
      return res;
    }

    Text ok = new Text("");
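    /**
     * Flushes the accumulated vocabularies when the map task shuts down:
     * the target vocabulary is written to {@code <root>/vocab.E} and the
     * source vocabulary to {@code <root>/vocab.F} on the job's file system.
     */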
    @Override
    public void close() {
      System.err.println("Target: " + vocE.size() + " types. Writing to " + job_.get("root", null) + "/vocab.E");
      System.err.println("Source: " + vocF.size() + " types. Writing to " + job_.get("root", null) + "/vocab.F");
      // Write out the vocabularies to file.
      try {
        FileSystem fs = FileSystem.get(job_);
        DataOutputStream dos = new DataOutputStream(
            new BufferedOutputStream(fs.create(new Path(job_.get("root", null) + "/vocab.E"))));
        ((VocabularyWritable) vocE).write(dos);
        dos.close();
        DataOutputStream dos2 = new DataOutputStream(
            new BufferedOutputStream(fs.create(new Path(job_.get("root", null) + "/vocab.F"))));
        ((VocabularyWritable) vocF).write(dos2);
        dos2.close();
      } catch (IOException e) {
        throw new RuntimeException("Vocab couldn't be written to disk.\n" + e.toString());
      }
    }

    // Read in XML-format bitext and output each instance as a PhrasePair object
    // with a unique string id as key.
    public void map(Text key, Text value, OutputCollector<Text, PhrasePair> oc, Reporter reporter)
        throws IOException {
      // key: a parallel sentence pair and its alignment, in XML format; the value is ignored.
      ParallelChunk c = pcr.parseString(key.toString());
      ok.set(c.idString());

      // A Chunk is an array of tokens in the sentence, without any special
      // tokenization (tokens are simply separated by spaces).
      Chunk fc = c.getChunk(src);
      Chunk ec = c.getChunk(tgt);
      if (fc == null || ec == null) {
        reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1);
        return;
      }
      if (fc.getLength() > 200) {
        reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1);
        return;
      }
      if (ec.getLength() > 200) {
        reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1);
        return;
      }

      // ec, fc: English/French sentence represented as a sequence of words
      // vocE, vocF: vocabularies for English and French, of type VocabularyWritable
      // ee, fe: integer representation of the words in sentences ec and fc
      sLogger.debug("Target sentence:");
      int[] ee = convertStrings(tawp.preprocessWordsForAlignment(ec.getWords()), vocE);
      sLogger.debug("Source sentence:");
      int[] fe = convertStrings(sawp.preprocessWordsForAlignment(fc.getWords()), vocF);

      // e, f: phrases spanning the whole sentence
      Phrase e = new Phrase(ee, 0);
      Phrase f = new Phrase(fe, 1);
      edu.umd.hooka.PhrasePair b = new PhrasePair(f, e);
      ReferenceAlignment ra = c.getReferenceAlignment(lp);
      if (ra != null) {
        b.setAlignment(ra);
      }

      reporter.incrCounter(BitextCompilerCounters.EN_WORDS, e.getWords().length);
      reporter.incrCounter(BitextCompilerCounters.FR_WORDS, f.getWords().length);
      reporter.incrCounter(BitextCompilerCounters.CHUNKS, 1);
      oc.collect(ok, b);
    }
  }

  public static class XMLInput extends FileInputFormat<Text, Text> {

    private CompressionCodecFactory compressionCodecs = null;

    public void configure(JobConf conf) {
      compressionCodecs = new CompressionCodecFactory(conf);
    }

    protected boolean isSplitable(FileSystem fs, Path file) {
      if (compressionCodecs == null)
        return true;
      return compressionCodecs.getCodec(file) == null;
    }

    public RecordReader<Text, Text> getRecordReader(InputSplit genericSplit, JobConf job,
        Reporter reporter) throws IOException {
      reporter.setStatus(genericSplit.toString());
      FileSplit split = (FileSplit) genericSplit;
      final Path file = split.getPath();
      FileSystem fs = file.getFileSystem(job);
      FSDataInputStream fileIn = fs.open(split.getPath());
      if (compressionCodecs != null && compressionCodecs.getCodec(file) != null)
        throw new RuntimeException("Not handling compression!");
      return new StreamXmlRecordReader(fileIn, split, reporter, job, FileSystem.get(job));
    }
  }
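  /**
   * Configures and runs the bitext compilation job: reads the XML bitext under
   * {@code inputPaths} (records delimited by {@code <pchunk>} elements), numberizes
   * each parallel chunk into a PhrasePair keyed by its id string, and writes the
   * result as a SequenceFile under {@code output}. The source and target
   * vocabularies are written as a side effect in BitextCompilerMapper.close().
   */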
  @SuppressWarnings({ "deprecation", "null" })
  public static void preprocessAndNumberizeFiles(Configuration c, String inputPaths, Path output)
      throws IOException {
    sLogger.setLevel(Level.INFO);

    JobConf conf = new JobConf(c);
    conf.setJobName("bitext.compile");

    boolean useVocabServer = false;
    Thread vst1 = null;
    Thread vst2 = null;
    VocabServer vocabServer1 = null;
    VocabServer vocabServer2 = null;

    try {
      // inputPaths = bitext given as input in the main method of HadoopAlign
      conf.setOutputKeyClass(Text.class);
      conf.setOutputValueClass(PhrasePair.class);
      conf.setMapperClass(BitextCompilerMapper.class);
      conf.setReducerClass(IdentityReducer.class);
      conf.setNumMapTasks(1);
      conf.setNumReduceTasks(1);
      FileInputFormat.setInputPaths(conf, inputPaths);
      conf.set("stream.recordreader.begin", "<pchunk");
      conf.set("stream.recordreader.end", "</pchunk>");
      conf.set("stream.recordreader.slowmatch", "false");
      conf.set("stream.recordreader.maxrec", "100000");
      conf.setInputFormat(XMLInput.class);
      FileOutputFormat.setOutputPath(conf, output);
      conf.setOutputFormat(SequenceFileOutputFormat.class);
      conf.setJarByClass(CorpusVocabNormalizerAndNumberizer.class);

      System.out.println("Running job " + conf.getJobName());
      System.out.println("Input: " + inputPaths);
      System.out.println("Output: " + output);
      JobClient.runJob(conf);
    } finally {
      try {
        if (vst1 != null)
          vocabServer1.stopServer();
        if (vst2 != null)
          vocabServer2.stopServer();
        if (vst1 != null)
          vst1.join();
        if (vst2 != null)
          vst2.join();
      } catch (InterruptedException e) {
      }
    }
  }

  public static void main(String args[]) {
    Path[] files = new Path[2];
    files[0] = new Path("/Users/redpony/bitexts/man-align/deen.ccb_jhu.xml");
    files[1] = new Path("/tmp/bar.xml");
    try {
      Configuration c = new Configuration();
      c.set(SRC_LANG, "de");
      c.set(TGT_LANG, "en");
      // c.set("mapred.job.tracker", "local");
      // c.set("fs.default.name", "file:///");
      // FileSystem.get(c).delete(new Path("/Users/ferhanture/Documents/work/hadoop-0.20.1/dummy.out"), true);
      // preprocessAndNumberizeFiles(c, "/Users/ferhanture/edu/research/programs/hadoop-aligner/training-data.tar/eu-nc-wmt2008.de-en/eu-nc-wmt2008.de-en.xml", new Path("/Users/ferhanture/Documents/work/hadoop-0.20.1/dummy.out"));
      preprocessAndNumberizeFiles(c, "/umd-lin/fture/mt/eu-nc-wmt2008.de-en.xml",
          new Path("/umd-lin/fture/mt/aligner/comp-bitext"));
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}