package edu.umd.hooka;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

/**
 * Compiles a parallel text (an English file, a French/foreign file, and an
 * optional word-alignment file) into a SequenceFile of PhrasePair records,
 * plus the two vocabularies and a metadata object. Runs as a single-map,
 * zero-reduce Hadoop job.
 */
public class HBitextCompiler {

	static enum BitextCompilerCounters {
		EN_WORDS, FR_WORDS, LINES, ENCODING_ERRORS
	};

	static final String OUTPUT_BASENAME = "bitextcomp.outputbasename";
	static final String EN_PATH = "bitextcomp.enpath";
	static final String FR_PATH = "bitextcomp.frpath";
	static final String AL_PATH = "bitextcomp.alpath";

	public static class BitextCompilerMapper extends MapReduceBase
			implements Mapper<LongWritable, Text, LongWritable, Text> {

		String outputBase = null;
		Path pf = null;
		Path pe = null;
		Path pa = null;

		public void configure(JobConf job) {
			outputBase = job.get(OUTPUT_BASENAME);
			pe = new Path(job.get(EN_PATH));
			pf = new Path(job.get(FR_PATH));
			// The alignment file is optional: an empty or missing value
			// means the bitext is compiled without alignments.
			String alps = job.get(AL_PATH);
			if (alps != null && !alps.equals(""))
				pa = new Path(alps);
		}

		public void map(LongWritable key, Text value,
				OutputCollector<LongWritable, Text> oc, Reporter reporter)
				throws IOException {
			Path output = new Path(outputBase);
			Path pmd = new Path(outputBase + ".metadata");
			Configuration conf = new Configuration();
			FileSystem fileSys = FileSystem.get(conf);
			VocabularyWritable vocE = new VocabularyWritable();
			VocabularyWritable vocF = new VocabularyWritable();
			SequenceFile.Writer sfw = SequenceFile.createWriter(fileSys, conf,
					output, IntWritable.class, PhrasePair.class);

			boolean hasAlignment = (pa != null);
			BufferedReader rde = new BufferedReader(
					new InputStreamReader(fileSys.open(pe), "UTF-8"));
			BufferedReader rdf = new BufferedReader(
					new InputStreamReader(fileSys.open(pf), "UTF-8"));
			BufferedReader rda = null;
			if (hasAlignment)
				rda = new BufferedReader(
						new InputStreamReader(fileSys.open(pa), "UTF-8"));

			String es;
			IntWritable lci = new IntWritable(0);
			int lc = 0;
			// Touch the counter so it appears in the job report even when
			// no encoding errors occur.
			reporter.incrCounter(BitextCompilerCounters.ENCODING_ERRORS, 0);
			while ((es = rde.readLine()) != null) {
				lc++;
				if (lc % 100 == 0)
					reporter.progress();
				reporter.incrCounter(BitextCompilerCounters.LINES, 1);
				String fs = rdf.readLine();
				if (fs == null)
					throw new RuntimeException(pf + " has fewer lines than " + pe);
				try {
					Phrase e = Phrase.fromString(0, es, vocE);
					Phrase f = Phrase.fromString(1, fs, vocF);
					PhrasePair b = new PhrasePair(f, e);
					if (hasAlignment) {
						Alignment a = new Alignment(f.size(), e.size(), rda.readLine());
						b.setAlignment(a);
					}
					lci.set(lc);
					sfw.append(lci, b);
					reporter.incrCounter(BitextCompilerCounters.EN_WORDS,
							e.getWords().length);
					reporter.incrCounter(BitextCompilerCounters.FR_WORDS,
							f.getWords().length);
					reporter.progress();
				} catch (Exception ex) {
					// Count the bad sentence pair and keep going rather than
					// failing the whole job.
					System.err.println("\nAt line " + lc + " caught: " + ex);
					reporter.incrCounter(BitextCompilerCounters.ENCODING_ERRORS, 1);
				}
			}
			if (rdf.readLine() != null)
				throw new RuntimeException(pf + " has more lines than " + pe);
			rde.close();
			rdf.close();
			if (rda != null)
				rda.close();
			sfw.close();

			// Write the two vocabularies alongside the compiled bitext.
			Path pve = new Path(outputBase + ".voc.e");
			DataOutputStream dos = new DataOutputStream(
					new BufferedOutputStream(fileSys.create(pve)));
			vocE.write(dos);
			dos.close();
			Path pvf = new Path(outputBase + ".voc.f");
			dos = new DataOutputStream(
					new BufferedOutputStream(fileSys.create(pvf)));
			vocF.write(dos);
			dos.close();

			// Record sentence and vocabulary counts for downstream jobs.
			Metadata theMetadata = new Metadata(lc, vocE.size(), vocF.size());
			ObjectOutputStream mdstream = new ObjectOutputStream(
					new BufferedOutputStream(fileSys.create(pmd)));
			mdstream.writeObject(theMetadata);
			mdstream.close();

			oc.collect(new LongWritable(0), new Text("done"));
		}
	}

	/**
	 * @param args
	 */
	@SuppressWarnings("deprecation")
	public static void main(String[] args) {
		JobConf conf = new JobConf(HBitextCompiler.class);
		conf.set(OUTPUT_BASENAME, "/shared/bitexts/ep700k+nc.de-en/ep700k+nc");
		conf.set(FR_PATH, "filt.lc.de");
		conf.set(EN_PATH, "filt.lc.en");
		conf.set(AL_PATH, ""); // e.g. /user/redpony/model-5M/aligned.grow-diag-final

		conf.setJobName("bitext.compile");
		conf.setOutputKeyClass(LongWritable.class);
		conf.setOutputValueClass(Text.class);
		conf.setMapperClass(BitextCompilerMapper.class);
		// All the work happens in a single map task; there is no reduce.
		conf.setNumMapTasks(1);
		conf.setNumReduceTasks(0);
		FileInputFormat.setInputPaths(conf, new Path("dummy"));
		try {
			FileSystem.get(conf).delete(new Path("dummy.out"));
			FileOutputFormat.setOutputPath(conf, new Path("dummy.out"));
			conf.setOutputFormat(SequenceFileOutputFormat.class);
			JobClient.runJob(conf);
		} catch (IOException e) {
			System.err.println("Caught " + e);
			e.printStackTrace();
		}
	}
}