package edu.umd.hooka;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

import edu.umd.hooka.corpora.Chunk;
import edu.umd.hooka.corpora.Language;
import edu.umd.hooka.corpora.LanguagePair;
import edu.umd.hooka.corpora.ParallelChunk;
import edu.umd.hooka.corpora.ParallelCorpusReader.PChunkCallback;

/**
 * Prepares text for word alignment by running tokens through an
 * {@link AlignmentWordPreprocessor}.
 *
 * <p>Two entry points are provided:
 * <ul>
 *   <li>{@link #main(String[])} preprocesses a plain text file line by line;</li>
 *   <li>{@link WriterCallback} receives parallel chunks and writes the
 *       preprocessed Arabic and English sides, plus the chunk names, to three
 *       line-aligned files.</li>
 * </ul>
 * All files are read and written as UTF-8.
 */
public class CreateWordAlignmentCorpus {

    /** Charset name used for every file this class opens. */
    private static final String ENCODING = "UTF8";

    /**
     * Callback that writes each accepted Arabic/English chunk pair as one line
     * in each of three output files (English text, Arabic text, chunk name).
     * Line {@code n} of every file refers to the same chunk pair.
     */
    static class WriterCallback implements PChunkCallback {
        /** Receives the preprocessed English side. */
        BufferedWriter ew;
        /** Receives the preprocessed Arabic ("foreign") side. */
        BufferedWriter fw;
        /** Receives the name of each chunk pair that was written. */
        BufferedWriter lw;
        Language ar = Language.languageForISO639_1("ar");
        Language en = Language.languageForISO639_1("en");
        LanguagePair lp = LanguagePair.languageForISO639_1Pair("ar-en");
        /** Preprocessor applied to the Arabic tokens. */
        AlignmentWordPreprocessor sawp;
        /** Preprocessor applied to the English tokens. */
        AlignmentWordPreprocessor tawp;

        /** Chunks with more tokens than this on either side are skipped. */
        static final int MAX_LENGTH = 99;

        /**
         * Opens the three output files and creates the preprocessors.
         *
         * @param e path of the file receiving the English side
         * @param f path of the file receiving the Arabic side
         * @param l path of the file receiving the chunk names
         * @throws IOException if any of the files cannot be opened for writing
         */
        WriterCallback(String e, String f, String l) throws IOException {
            ew = openWriter(e);
            fw = openWriter(f);
            lw = openWriter(l);
            // NOTE(review): the meaning of the null third argument is defined by
            // AlignmentWordPreprocessor.CreatePreprocessor, which is not visible here.
            sawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, ar, null);
            tawp = AlignmentWordPreprocessor.CreatePreprocessor(lp, en, null);
        }

        /**
         * Flushes and closes all three output files.
         *
         * <p>Every writer is closed even if closing an earlier one fails;
         * {@link BufferedWriter#close()} flushes pending output itself.
         *
         * @throws IOException if flushing or closing any of the files fails
         */
        public void close() throws IOException {
            try {
                ew.close();
            } finally {
                try {
                    fw.close();
                } finally {
                    // The names file must be closed too, otherwise its buffered
                    // tail is never written and the three files fall out of step.
                    lw.close();
                }
            }
        }

        /**
         * Writes one chunk pair to the output files.
         *
         * <p>The pair is silently skipped when either side is missing, empty
         * (before or after preprocessing) or longer than {@link #MAX_LENGTH}
         * tokens. An I/O failure while writing prints the stack trace and
         * terminates the JVM with status 1.
         *
         * @param p the parallel chunk to write
         */
        public void handlePChunk(ParallelChunk p) {
            Chunk arabic = p.getChunk(ar);
            Chunk english = p.getChunk(en);
            if (arabic == null || english == null) {
                return;
            }

            String[] rawArabic = arabic.getWords();
            String[] rawEnglish = english.getWords();
            if (rawArabic.length > MAX_LENGTH || rawEnglish.length > MAX_LENGTH) {
                return;
            }
            if (rawArabic.length == 0 || rawEnglish.length == 0) {
                return;
            }

            String[] arabicWords = sawp.preprocessWordsForAlignment(rawArabic);
            String[] englishWords = tawp.preprocessWordsForAlignment(rawEnglish);
            // Apply the same "no empty side" rule to the preprocessed tokens, so
            // an empty preprocessor result is skipped instead of raising an
            // exception while the line is being assembled.
            if (arabicWords.length == 0 || englishWords.length == 0) {
                return;
            }

            String arabicLine = joinWords(arabicWords);
            String englishLine = joinWords(englishWords);
            try {
                lw.write(p.getName());
                fw.write(arabicLine);
                ew.write(englishLine);
                lw.newLine();
                fw.newLine();
                ew.newLine();
            } catch (IOException ex) {
                ex.printStackTrace();
                System.exit(1);
            }
        }
    }

    /** Opens {@code path} for buffered UTF-8 writing, truncating any existing file. */
    private static BufferedWriter openWriter(String path) throws IOException {
        return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), ENCODING));
    }

    /** Opens {@code path} for buffered UTF-8 reading. */
    private static BufferedReader openReader(String path) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(path), ENCODING));
    }

    /** Joins {@code words} with single spaces; an empty array yields an empty string. */
    private static String joinWords(String[] words) {
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < words.length; i++) {
            if (i > 0) {
                joined.append(' ');
            }
            joined.append(words[i]);
        }
        return joined.toString();
    }

    /**
     * Preprocesses a text file for word alignment.
     *
     * <p>Each input line is split on runs of whitespace, passed through the
     * preprocessor for the given language (paired with English), and written
     * to the output file as space-separated tokens, one output line per input
     * line.
     *
     * <p>Exits with status 1 on a usage error or an I/O failure.
     *
     * @param args {@code <lang> <infile.txt> <outfile.txt>}, where {@code lang}
     *             is a two-letter ISO 639 code
     */
    public static void main(String[] args) {
        if (args.length != 3) {
            System.err.println("Usage: CreateWordAlignmentCorpus <lang> <infile.txt> <outfile.txt>");
            System.err.println(" (note: lang must be a two-letter ISO639 code)");
            System.exit(1);
        }
        try {
            BufferedReader in = openReader(args[1]);
            try {
                Language fl = Language.languageForISO639_1(args[0]);
                LanguagePair lp = LanguagePair.languageForISO639_1Pair(args[0] + "-en");
                AlignmentWordPreprocessor sawp =
                        AlignmentWordPreprocessor.CreatePreprocessor(lp, fl, null);

                BufferedWriter out = openWriter(args[2]);
                try {
                    String line;
                    while ((line = in.readLine()) != null) {
                        String[] res = sawp.preprocessWordsForAlignment(line.split("\\s+"));
                        for (int i = 0; i < res.length; i++) {
                            if (i > 0) {
                                out.write(' ');
                            }
                            out.write(res[i]);
                        }
                        out.newLine();
                    }
                    out.flush();
                } finally {
                    // Release the output file even when reading or writing failed.
                    out.close();
                }
            } finally {
                in.close();
            }
        } catch (IOException ex) {
            ex.printStackTrace();
            // Report the failure to the caller instead of exiting with status 0.
            System.exit(1);
        }
    }
}