package com.transmem.action; import java.util.logging.Logger; import javax.servlet.ServletException; import java.io.IOException; import java.io.FileInputStream; import java.io.OutputStream; import java.sql.SQLException; import java.sql.Connection; import java.sql.ResultSet; import java.sql.Statement; //import java.sql.Blob; import java.util.ArrayList; import java.util.Map; import com.transmem.data.db.Databases; import com.transmem.data.db.Projects; import com.transmem.data.db.Articles; import com.transmem.data.db.Users; import com.transmem.data.db.Roles; import com.transmem.data.db.Corpora; import com.transmem.data.db.CorpusTally; import com.transmem.data.db.Sources; import com.transmem.data.tm.Corpus; import com.transmem.doc.IUnitSaver; import com.transmem.doc.TmxLoader; import com.transmem.doc.BitextLoader; import com.transmem.nlp.ISegmenter; import com.transmem.nlp.LanguageManager; import com.transmem.nlp.LanguageException; //import com.transmem.doc.FileParserFactory; /** * Action class for uploading a pair of bilingual text files or a TMX file. * The sentences will be aligned and saved in the selected language corpus. * It returns to the edit page for editing the sentences. * * @version 0.1 * @author Ted Wen * @date Jun. 2007 */ public class UploadAction extends BaseAction implements IUnitSaver { private Logger log_ = Logger.getLogger(UploadAction.class.getName()); private Session session_; private Connection conn_; private int uid_; private String format_; private String domain_; private int permit_; private int from_; private boolean failed_; private String seqname_; private String srclang_; private String dstlang_; private ISegmenter sourceSegmenter_; private ISegmenter targetSegmenter_; private ResultSet rs_, rs1_, rs2_; private int units_; private String title_; public UploadAction() { super(); } /** * Execute action for saving bilingual sentences from an uploaded TMX file * or two text files. The sentences may not be loaded as they are. If a sentence is too long * (greater than 1000 characters) it may be broken into two or even more sentences. In this * case, the translation sentence will be aligned to the separate sub-sentences. The alignment may be * incorrect and need manual editing. After the sentences are loaded, it returns to the sentence * compare page where the author can edit and correct the text sentence by sentence. * * Request Parameters: * filetype : either 'tmx' or 'bitext' * bisfile, bidfile: when filetype is bitext, these are the source and target language files * tmxfile: the filename for the tmx file * */ public void execute(ServletParams param) throws ServletException, IOException { log_.entering("UploadAction","execute"); param.setContentType("text/html;charset=utf-8"); this.session_ = param.getSession(); Users usr = session_.getUser(); if (usr == null) { param.sendError(MessageCode.ERR_NOT_LOGIN); return; } this.uid_ = usr.getUserID(); this.failed_ = false; this.from_ = 0; this.units_ = 0; this.domain_ = "00"; if (param.getParameter("domain")!=null) this.domain_ = param.getParameter("domain"); String spermit = param.getParameter("permit"); this.permit_ = 0; if (spermit==null || spermit.equals("P")) this.permit_ = 0; else if (spermit.equals("G")) this.permit_ = usr.getGroup(); else this.permit_ = -1; //private use this.srclang_ = param.getParameter("bislang"); this.dstlang_ = param.getParameter("bidlang"); this.title_ = "Untitled document"; if (param.getParameter("filetitle")!=null) { this.title_ = param.getParameter("filetitle"); try { byte[] sb = this.title_.getBytes("ISO-8859-1"); this.title_ = new String(sb, "UTF-8"); } catch (java.io.UnsupportedEncodingException e) { } } String filetype = param.getParameter("filetype"); if (filetype.equals("tmx")) { this.format_ = "TMX"; parseTmx(this.session_, param); setNextPage(PageLinks.MYCORPUS_PAGE); } else { this.format_ = "TXT"; parseBitext(this.session_, param); setNextPage(PageLinks.MYSENTENCES_PAGE); } } /** * Parse a TMX file to filter out the sentence pairs and save in the database. * A valid TMX file should follow the standard, with the following essential tags: * <?xml encoding="UTF-8"?> * <tmx> * <header> * <prop type="Domain">the domain id such as IT</prop> * <prop type="Source">(-1)what dictionary</prop> * </head> * <body> * <tu tuid="xx"> * <tuv xml:lang="EN"> * <seg><![CDATA[sentence...]]></seg> * </tuv> * <tuv xml:lang="ZH"> * <seg>...</seg> * </tuv> * </tu> * </body> */ protected void parseTmx(Session session, ServletParams param) throws ServletException, IOException { String tmxfilename = param.getFilePathName("tmxfile"); if (tmxfilename == null) { java.util.Enumeration en = param.getFileNames(); while (en.hasMoreElements()) { log_.info((String)en.nextElement()); } } log_.info("About to parse file '"+tmxfilename+"'"); //create a loader which will call this.saveUnit to save a sentence pair TmxLoader loader = new TmxLoader(tmxfilename, this); log_.info("TMX file '"+tmxfilename+"' finished."); } /** * Parse two text files by aligning sentence by sentence. */ protected void parseBitext(Session session, ServletParams param) throws ServletException, IOException { String sfilename = param.getFilePathName("bisfile"); String tfilename = param.getFilePathName("bidfile"); String slang = param.getParameter("bislang"); String tlang = param.getParameter("bidlang"); String sencoding = param.getParameter("senc"); String tencoding = param.getParameter("denc"); log_.info(String.format("About to parse file '%s' and '%s'",sfilename,tfilename)); //create a loader which will call this.saveUnit to save a sentence pair BitextLoader loader = new BitextLoader(sfilename, tfilename, sencoding, tencoding, this); } /** * IUnitSaver interface method, called by Loader when parsing starts */ public void start() { this.conn_ = null; try { this.conn_ = getConnection(this.session_.getHttpSession(),Databases.CATEGORY_MAIN,true); if (this.conn_ == null) { log_.severe("getConnection(MAIN,true) return null"); } this.conn_.setAutoCommit(false); } catch (IOException ioe) { log_.severe(ioe.toString()); this.failed_ = true; } catch (ServletException se) { log_.severe(se.toString()); this.failed_ = true; } catch (SQLException x) { log_.severe("SQLException when getConnection(USER)"+x.toString()); this.failed_ = true; //param.sendError(MessageCode.ERR_DB_CONNECT); //return; } } /** * IUnitSaver interface method, called by Loader when parsing ends */ public void end() { if (this.conn_ != null) { try { if (!this.failed_) { //save into t_sources log_.info("Saving source: aid="+this.from_+", title="+this.title_); Corpus.saveSource(this.conn_, this.from_, this.title_, this.format_, this.uid_, this.srclang_, this.dstlang_, this.units_, this.domain_, this.permit_); //save t_corpora Corpus.saveCorpora(this.conn_, this.srclang_, this.dstlang_, this.domain_, this.permit_, this.units_); //done log_.info(this.units_+" units saved"); //update variables in session updateSession(this.conn_, this.session_); //add points to the user if shared by the public if (this.permit_ == 0) addPoints(this.conn_, this.session_, this.units_); this.conn_.commit(); } else this.conn_.rollback(); this.conn_.close(); this.conn_ = null; } catch (SQLException x) { try { this.conn_.rollback(); } catch (SQLException se) {} log_.warning("conn.close() exception"); } } } /** * IUnitSaver interface method, called by Loader when a property is ready */ public void setProperty(String key, String value) { if (key.equalsIgnoreCase("Domain")) { this.domain_ = value; } else if (key.equalsIgnoreCase("Source")) { if (value.indexOf('(')>=0) { value = value.substring(value.indexOf('(')+1,value.indexOf(')')); } try { this.from_ = Integer.parseInt(value); } catch (NumberFormatException nfe) { log_.warning("setPropery('Source',"+value+"), bad number"); } } else if (key.equalsIgnoreCase("srclang")) //valid for TMX { this.srclang_ = value; log_.info("Set source language to "+value); } else if (key.equalsIgnoreCase("dstlang")) { this.dstlang_ = value; log_.info("Set target language to "+value); } } /** * Update session variables for examples tally and my list. */ private void updateSession(Connection conn, Session session) throws SQLException { Corpora c = new Corpora(conn); ArrayList<CorpusTally> cts = c.queryCorpusStats(this.srclang_, this.dstlang_); log_.info("queryCorpusStats return "+cts.size()); session.setCorpusTally(cts); //load corpus tables into session ArrayList<String> corpusNames = c.queryCorpusTables(); log_.info("Corpora.queryCorpusTables() return "+corpusNames.size()+" names"); session.setCorpusNames(corpusNames); //load my sources Sources s = new Sources(conn); ArrayList<Sources> srcs = s.queryByOwner(this.uid_); log_.info(srcs.size()+" sources loaded for user "+this.uid_); session.setCorpusSourceList(srcs); } /** * IUnitSaver interface method, called by Loader when a pair of sentences ready for saving */ public void saveUnit(String src, String dst) throws java.sql.SQLException { if (this.sourceSegmenter_==null || this.targetSegmenter_==null) { createDatasource(); } long sid = getSequenceLong(this.conn_, this.seqname_); log_.info("About to save sentence sid="+sid); this.rs_.moveToInsertRow(); this.rs_.updateLong("F_SID", sid); this.rs_.updateString("F_Source", src); this.rs_.updateString("F_Target", dst); this.rs_.updateString("F_Domain", this.domain_); if (this.from_ == 0) this.from_ = getSequenceInt(this.conn_, "S_Sources"); this.rs_.updateInt("F_From", this.from_); this.rs_.updateInt("F_Permit", this.permit_); this.rs_.updateInt("F_Owner", this.uid_); this.rs_.insertRow(); //log_.info("Sentence "+i+" saved, calling index()"); try { Corpus.makeIndices(this.rs1_, this.sourceSegmenter_, sid, src); //log_.info("Index for sentence "+i+" finished"); Corpus.makeIndices(this.rs2_, this.targetSegmenter_, sid, dst); } catch (LanguageException le) { log_.severe("Error at Corpus.makeIndices(): "+le.toString()); throw new SQLException("LanguageException at Corpus.makeIndices():"+le.toString()); } // this.units_ ++; } private void createDatasource() throws SQLException { try { //check availability of table, if not create it Corpora cs = new Corpora(this.conn_); if (cs.countTableByPair(this.srclang_, this.dstlang_) <= 0) { Corpus.createCorpusTables(this.srclang_, this.dstlang_, this.conn_); log_.info("Corpus tables for "+this.srclang_+"->"+this.dstlang_+" are created"); } createSegmenters(this.session_, this.srclang_, this.dstlang_); log_.info("createSegmenters("+this.srclang_+","+this.dstlang_+") done"); //get table name for the lang pair String tableName = makeCorpusTableName(this.srclang_, this.dstlang_); //delete sentences from corpus if already there this.seqname_ = "S"+tableName.substring(1); // ArrayList<Long> sids = revokeSentences(conn, tableName, aid); //create resultsets for corpus, indexes createResultSets(this.conn_, this.srclang_, this.dstlang_, tableName); log_.info("createResultSets() finished"); } catch (SQLException e) { log_.warning("saveUnit caused SQLException: "+e.toString()); this.failed_ = true; throw e; } catch (LanguageException le) { log_.warning("createDatasource() caused LanguageException: "+le.toString()); this.failed_ = true; throw new SQLException("LanguageException at createSegmenters:"+le.toString()); } } /** * Create segmenters based on the specified language codes. * The ISegmenter objected are created and saved in the member variables: source_, target_. * @param session - Session for the Http session * @param scode - source language code like EN * @param tcode - target language code like ZH */ private void createSegmenters(Session session, String scode, String tcode) throws LanguageException { javax.servlet.ServletContext app = session.getHttpSession().getServletContext(); Map<String,String> codenames = (Map<String,String>)app.getAttribute("languages"); String slang = codenames.get(scode); String tlang = codenames.get(tcode); this.sourceSegmenter_ = LanguageManager.createSegmenter(slang); this.targetSegmenter_ = LanguageManager.createSegmenter(tlang); } /** * Create three resultset: corpus table, source index table, target index table. * The source index table maps the source language in the corpus, not the sentence source. * @param conn - Connection reference * @param scode - source language code like EN * @param tcode - target language code like ZH */ protected void createResultSets(Connection conn, String scode, String tcode, String tablename) throws SQLException { String index1 = String.format("%s_%sX",tablename,scode); String index2 = String.format("%s_%sX",tablename,tcode); String sql = "SELECT * FROM "+tablename+" WHERE F_SID = 0"; Statement stmt = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE,ResultSet.CONCUR_UPDATABLE); this.rs_ = stmt.executeQuery(sql); Statement stmt1 = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE,ResultSet.CONCUR_UPDATABLE); this.rs1_ = stmt1.executeQuery("SELECT * FROM "+index1+" WHERE F_Word='x'"); Statement stmt2 = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE,ResultSet.CONCUR_UPDATABLE); this.rs2_ = stmt2.executeQuery("SELECT * FROM "+index2+" WHERE F_Word='x'"); } /** * Add points to the user who contributed example sentences to the public resevoir. * The number of points is determined by the number of sentences times the rate of points per sentence. * @param conn - Connection ref * @param session - Session ref * @param sents - number of sentences */ protected void addPoints(Connection conn, Session session, int sents) throws SQLException { int rate = session.getPointsPerShare().intValue(); Users usr = session.getUser(); usr.setConnection(conn); usr.clearUpdates(); usr.setPoints(usr.getPoints() + rate * sents); usr.update(); } /* protected String getFileFormat(String filename) { String fileformat = ""; if (filename.length() < 4) { return ""; } fileformat = filename.substring(filename.lastIndexOf('.')+1).toUpperCase(); if (fileformat.equalsIgnoreCase("html")) { fileformat = "HTM"; } else if (fileformat.length() != 3) { return ""; } return fileformat; }*/ }