package com.transmem.doc; import java.io.IOException; import java.io.File; import java.io.PrintWriter; import java.io.FileReader; import java.io.BufferedReader; import java.util.ArrayList; import java.util.logging.Logger; import java.sql.SQLException; /** * Bitext loader loads a bilingual parallel text from two separate files. * The files should be in plain text format with proper punctuation marks * for each sentence. In other words, a sentence in the source language * text should have a corresponding sentence in the translation text. * The current implementation does not include automatic alignment, it is * the user's responsibility to guarantee that the sentences are aligned. * This class implements ITextSaver for TextParser to parse the text into * sentences. */ public class BitextLoader implements ITextSaver { public static final Logger log_ = Logger.getLogger(BitextLoader.class.getName()); class Record { public int index_; public int length_; public Record link_; //link to the other language sentence public Record(int index, int length) { this.index_ = index; this.length_ = length; this.link_ = null; } } private IUnitSaver saver_ = null; private String sencoding_, tencoding_; private File[] tempfiles_; //temporary filenames private PrintWriter[] tempwriters_; private ArrayList<Record>[] slens_; //length of sentences private int curfile_, index_; /** * Construct BitextLoader with two file names. */ public BitextLoader(String sfilename, String tfilename, String sencoding, String tencoding, IUnitSaver saver) throws IOException,SQLException { this.saver_ = saver; this.sencoding_ = sencoding; this.tencoding_ = tencoding; parseFiles(sfilename, tfilename); } /** * Parse the two files to get sentences. * @param sfilename - source text filename * @param tfilename - target text filename */ protected void parseFiles(String sfilename, String tfilename) throws IOException,SQLException { this.slens_ = new ArrayList[2];//{new ArrayList<Record>(),new ArrayList<Record>()}; this.slens_[0] = new ArrayList<Record>(); this.slens_[1] = new ArrayList<Record>(); //parse two files one by one, saving sentences into temporary text files this.tempfiles_ = new File[2]; this.tempfiles_[0] = File.createTempFile("tmbis",".tmp"); log_.info("Temporary file created 4 src: "+tempfiles_[0]); this.tempwriters_[0] = new PrintWriter(this.tempfiles_[0]); this.curfile_ = 0; this.index_ = 0; TextParser tps = new TextParser(); tps.parse(sfilename, this.sencoding_, this); this.tempwriters_[0].close(); this.tempfiles_[1] = File.createTempFile("tmbid",".tmp"); log_.info("Temporary file created 4 dst: "+tempfiles_[1]); this.tempwriters_[1] = new PrintWriter(this.tempfiles_[1]); this.curfile_ = 1; this.index_ = 0; TextParser tpd = new TextParser(); tpd.parse(tfilename, this.tencoding_, this); this.tempwriters_[1].close(); //merge and align the sentences boolean saved = false; if ((this.slens_[0].size() > 0) && (this.slens_[0].size() == this.slens_[1].size())) { mergeAndAlign(); saved = true; } //finally delete the temp files this.tempfiles_[0].delete(); this.tempfiles_[1].delete(); log_.info("Temporary files deleted"); File f = new File(sfilename); f.delete(); f = new File(tfilename); f.delete(); log_.info("Uploaded files deleted"); if (!saved) throw new IOException("Sentences do not match"); } /** * Align the sentences from the two files and merge if necessary. * A true aligner is an AI research topic, so we ignore that at the moment, * and leave the un-matched sentences empty. The user is then responsible * to edit the sentences and align manually. */ public void mergeAndAlign() throws SQLException,IOException { BufferedReader reader1 = new BufferedReader(new FileReader(this.tempfiles_[0])); BufferedReader reader2 = new BufferedReader(new FileReader(this.tempfiles_[1])); while (true) { String s1 = reader1.readLine(); String s2 = reader2.readLine(); if (s1 == null && s2 == null) break; this.saver_.saveUnit(s1, s2); } reader1.close(); reader2.close(); } // Interface method, not used here public void startParagraph(int startpos) throws java.sql.SQLException { } // Interface method, not used here public void endParagraph(int endpos) throws java.sql.SQLException { } /** * Called by a TextParser object when a sentence is ready to save. * @param sentence - String as a sentence * @param startpos - ignored * @param endpos - ignored */ public void saveSentence(String sentence, int startpos, int endpos) throws java.sql.SQLException { this.tempwriters_[this.curfile_].println(sentence); this.slens_[this.curfile_].add(new Record(this.index_, sentence.length())); this.index_ ++; } }