package edu.umd.hooka;

import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.streaming.StreamXmlRecordReader;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import edu.umd.hooka.alignment.aer.ReferenceAlignment;
import edu.umd.hooka.corpora.Chunk;
import edu.umd.hooka.corpora.Language;
import edu.umd.hooka.corpora.LanguagePair;
import edu.umd.hooka.corpora.ParallelChunk;
import edu.umd.hooka.corpora.ParallelCorpusReader;

public class CorpusVocabNormalizerAndNumberizer {

  static enum BitextCompilerCounters {
    EN_WORDS, FR_WORDS, CHUNKS, WRONG_LANGUAGE, SRC_TOO_LONG, TGT_TOO_LONG
  };

  private static final Logger sLogger = Logger.getLogger(CorpusVocabNormalizerAndNumberizer.class);

  static final String SRC_LANG = "ha.sourcelang";
  static final String TGT_LANG = "ha.targetlang";

  public static class BitextCompilerMapper extends MapReduceBase
      implements Mapper<Text, Text, Text, PhrasePair> {

    String outputBase = null;
    Path pf = null;
    Path pe = null;
    Path pa = null;
    static Vocab vocE = null;
    static Vocab vocF = null;
    ParallelCorpusReader pcr = new ParallelCorpusReader();
    Language src = null;
    Language tgt = null;
    AlignmentWordPreprocessor sawp = null;
    AlignmentWordPreprocessor tawp = null;
    LanguagePair lp = null;
    JobConf job_ = null;

    public void configure(JobConf job) {
      sLogger.setLevel(Level.OFF);
      src = Language.languageForISO639_1(job.get(SRC_LANG));
      tgt = Language.languageForISO639_1(job.get(TGT_LANG));
      sLogger.debug("Source language: " + src.code());
      sLogger.debug("Target language: " + tgt.code());
      boolean useVocabServer = false;
      if (!useVocabServer) {
        if (vocE == null)
          vocE = new VocabularyWritable();
        if (vocF == null)
          vocF = new VocabularyWritable();
      } else {
        try {
          vocE = new VocabServerClient(job.get("ha.vocabserver.host"),
              Integer.parseInt(job.get("ha.vocabserver.port1")));
          vocF = new VocabServerClient(job.get("ha.vocabserver.host"),
              Integer.parseInt(job.get("ha.vocabserver.port2")));
        } catch (IOException e) {
          e.printStackTrace();
          throw new RuntimeException(e);
        }
      }
      lp = LanguagePair.languageForISO639_1Pair(src.code() + "-" + tgt.code());
      if (job.getBoolean("ha.trunc.use", true)) {
        sawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, src, job);
        tawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, tgt, job);
      } else {
        sawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, job);
        tawp = AlignmentWordPreprocessor.CreatePreprocessor(null, null, job);
      }
      job_ = job;
    }

    public int[] convertStrings(String[] s, Vocab v) {
      int[] res = new int[s.length];
      for (int i = 0; i < s.length; ++i) {
        res[i] = v.addOrGet(s[i]);
        sLogger.info(s[i] + "-->" + res[i]);
      }
      return res;
    }

    Text ok = new Text("");
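    /**
     * Flushes the accumulated vocabularies when the map task shuts down:
     * the target vocabulary is written to {@code <root>/vocab.E} and the
     * source vocabulary to {@code <root>/vocab.F} on the job's file system.
     */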
    @Override
    public void close() {
      System.err.println("Target: " + vocE.size() + " types. Writing to " + job_.get("root", null) + "/vocab.E");
      System.err.println("Source: " + vocF.size() + " types. Writing to " + job_.get("root", null) + "/vocab.F");
      // Write out the vocabularies to file.
      try {
        FileSystem fs = FileSystem.get(job_);
        DataOutputStream dos = new DataOutputStream(
            new BufferedOutputStream(fs.create(new Path(job_.get("root", null) + "/vocab.E"))));
        ((VocabularyWritable) vocE).write(dos);
        dos.close();
        DataOutputStream dos2 = new DataOutputStream(
            new BufferedOutputStream(fs.create(new Path(job_.get("root", null) + "/vocab.F"))));
        ((VocabularyWritable) vocF).write(dos2);
        dos2.close();
      } catch (IOException e) {
        throw new RuntimeException("Vocab couldn't be written to disk.\n" + e.toString());
      }
    }

    // Read in XML-format bitext and output each instance as a PhrasePair object
    // with a unique string id as key.
    public void map(Text key, Text value, OutputCollector<Text, PhrasePair> oc, Reporter reporter)
        throws IOException {
      // key: a parallel sentence pair and its alignment, in XML format; the value is ignored.
      ParallelChunk c = pcr.parseString(key.toString());
      ok.set(c.idString());

      // A Chunk is an array of tokens in the sentence, without any special
      // tokenization (tokens are simply separated by spaces).
      Chunk fc = c.getChunk(src);
      Chunk ec = c.getChunk(tgt);
      if (fc == null || ec == null) {
        reporter.incrCounter(BitextCompilerCounters.WRONG_LANGUAGE, 1);
        return;
      }
      if (fc.getLength() > 200) {
        reporter.incrCounter(BitextCompilerCounters.SRC_TOO_LONG, 1);
        return;
      }
      if (ec.getLength() > 200) {
        reporter.incrCounter(BitextCompilerCounters.TGT_TOO_LONG, 1);
        return;
      }

      // ec, fc: English/French sentence represented as a sequence of words
      // vocE, vocF: vocabularies for English and French, of type VocabularyWritable
      // ee, fe: integer representation of the words in sentences ec and fc
      sLogger.debug("Target sentence:");
      int[] ee = convertStrings(tawp.preprocessWordsForAlignment(ec.getWords()), vocE);
      sLogger.debug("Source sentence:");
      int[] fe = convertStrings(sawp.preprocessWordsForAlignment(fc.getWords()), vocF);

      // e, f: phrases spanning the whole sentence
      Phrase e = new Phrase(ee, 0);
      Phrase f = new Phrase(fe, 1);
      edu.umd.hooka.PhrasePair b = new PhrasePair(f, e);
      ReferenceAlignment ra = c.getReferenceAlignment(lp);
      if (ra != null) {
        b.setAlignment(ra);
      }

      reporter.incrCounter(BitextCompilerCounters.EN_WORDS, e.getWords().length);
      reporter.incrCounter(BitextCompilerCounters.FR_WORDS, f.getWords().length);
      reporter.incrCounter(BitextCompilerCounters.CHUNKS, 1);
      oc.collect(ok, b);
    }
  }

  public static class XMLInput extends FileInputFormat<Text, Text> {

    private CompressionCodecFactory compressionCodecs = null;

    public void configure(JobConf conf) {
      compressionCodecs = new CompressionCodecFactory(conf);
    }

    protected boolean isSplitable(FileSystem fs, Path file) {
      if (compressionCodecs == null)
        return true;
      return compressionCodecs.getCodec(file) == null;
    }

    public RecordReader<Text, Text> getRecordReader(InputSplit genericSplit, JobConf job,
        Reporter reporter) throws IOException {
      reporter.setStatus(genericSplit.toString());
      FileSplit split = (FileSplit) genericSplit;
      final Path file = split.getPath();
      FileSystem fs = file.getFileSystem(job);
      FSDataInputStream fileIn = fs.open(split.getPath());
      if (compressionCodecs != null && compressionCodecs.getCodec(file) != null)
        throw new RuntimeException("Not handling compression!");
      return new StreamXmlRecordReader(fileIn, split, reporter, job, FileSystem.get(job));
    }
  }
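  /**
   * Configures and runs the bitext compilation job: reads the XML bitext under
   * {@code inputPaths} (records delimited by {@code <pchunk>} elements), numberizes
   * each parallel chunk into a PhrasePair keyed by its id string, and writes the
   * result as a SequenceFile under {@code output}. The source and target
   * vocabularies are written as a side effect in BitextCompilerMapper.close().
   */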
  @SuppressWarnings({ "deprecation", "null" })
  public static void preprocessAndNumberizeFiles(Configuration c, String inputPaths, Path output)
      throws IOException {
    sLogger.setLevel(Level.INFO);

    JobConf conf = new JobConf(c);
    conf.setJobName("bitext.compile");

    boolean useVocabServer = false;
    Thread vst1 = null;
    Thread vst2 = null;
    VocabServer vocabServer1 = null;
    VocabServer vocabServer2 = null;

    try {
      // inputPaths = bitext given as input in the main method of HadoopAlign
      conf.setOutputKeyClass(Text.class);
      conf.setOutputValueClass(PhrasePair.class);
      conf.setMapperClass(BitextCompilerMapper.class);
      conf.setReducerClass(IdentityReducer.class);
      conf.setNumMapTasks(1);
      conf.setNumReduceTasks(1);
      FileInputFormat.setInputPaths(conf, inputPaths);
      conf.set("stream.recordreader.begin", "<pchunk");
      conf.set("stream.recordreader.end", "</pchunk>");
      conf.set("stream.recordreader.slowmatch", "false");
      conf.set("stream.recordreader.maxrec", "100000");
      conf.setInputFormat(XMLInput.class);
      FileOutputFormat.setOutputPath(conf, output);
      conf.setOutputFormat(SequenceFileOutputFormat.class);
      conf.setJarByClass(CorpusVocabNormalizerAndNumberizer.class);

      System.out.println("Running job " + conf.getJobName());
      System.out.println("Input: " + inputPaths);
      System.out.println("Output: " + output);
      JobClient.runJob(conf);
    } finally {
      try {
        if (vst1 != null)
          vocabServer1.stopServer();
        if (vst2 != null)
          vocabServer2.stopServer();
        if (vst1 != null)
          vst1.join();
        if (vst2 != null)
          vst2.join();
      } catch (InterruptedException e) {
      }
    }
  }

  public static void main(String args[]) {
    Path[] files = new Path[2];
    files[0] = new Path("/Users/redpony/bitexts/man-align/deen.ccb_jhu.xml");
    files[1] = new Path("/tmp/bar.xml");
    try {
      Configuration c = new Configuration();
      c.set(SRC_LANG, "de");
      c.set(TGT_LANG, "en");
      // c.set("mapred.job.tracker", "local");
      // c.set("fs.default.name", "file:///");
      // FileSystem.get(c).delete(new Path("/Users/ferhanture/Documents/work/hadoop-0.20.1/dummy.out"), true);
      // preprocessAndNumberizeFiles(c, "/Users/ferhanture/edu/research/programs/hadoop-aligner/training-data.tar/eu-nc-wmt2008.de-en/eu-nc-wmt2008.de-en.xml", new Path("/Users/ferhanture/Documents/work/hadoop-0.20.1/dummy.out"));
      preprocessAndNumberizeFiles(c, "/umd-lin/fture/mt/eu-nc-wmt2008.de-en.xml",
          new Path("/umd-lin/fture/mt/aligner/comp-bitext"));
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}