package edu.umd.hooka; import java.io.*; public class LocalBitextCompiler { /** * @param args */ public static void main(String[] args) { //Path pf = new Path(args[0]); //Path pe = new Path(args[1]); //String outputBase = args[3]; if (args.length != 2) { System.err.println("Usage: " + LocalBitextCompiler.class.getName() + " <inbitext> <output.base>"); System.exit(1); } String inf = args[0]; String out = args[1]; try { BufferedReader in = new BufferedReader( new InputStreamReader(new FileInputStream(inf), "UTF8")); DataOutputStream outf = new DataOutputStream(new BufferedOutputStream( new FileOutputStream(out))); VocabularyWritable vocE = new VocabularyWritable(); VocabularyWritable vocF = new VocabularyWritable(); String es; int lc = 0; while ((es = in.readLine()) != null) { lc++; if (lc % 1000 == 0) { System.err.print('.'); } if (lc % 50000 == 0) { System.err.println("[" + lc + "]"); System.gc(); } String[] fields = es.split("\\s*\\|\\|\\|\\s*"); try { Phrase e=Phrase.fromString(0, fields[0], vocE); Phrase f=Phrase.fromString(1, fields[1], vocF); Alignment a = new Alignment(f.size(), e.size(), fields[2]); PhrasePair alignedSentence = new PhrasePair(f,e,a); outf.writeInt(lc); alignedSentence.write(outf); } catch (Exception e) { System.err.println("\nAt line "+lc+" caught: "+e); } } outf.writeInt(-1); vocE.write(outf); vocF.write(outf); outf.close(); System.err.println("\n Sentences: " + lc); System.err.println(" E-voc: " + vocE.size() + " types"); System.err.println(" F-voc: " + vocF.size() + " types"); } catch (Exception e) { System.err.println(e); e.printStackTrace(); } } }