UploadAction.java example

Explorer
transmem-master
- src
  - com
    - transmem
  - tmx
    - LoadTmx.java
- test
  - db
    - DBTest.java
  - java
package com.transmem.action;

import java.util.logging.Logger;

import javax.servlet.ServletException;
import java.io.IOException;
import java.io.FileInputStream;
import java.io.OutputStream;

import java.sql.SQLException;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
//import java.sql.Blob;
import java.util.ArrayList;
import java.util.Map;

import com.transmem.data.db.Databases;
import com.transmem.data.db.Projects;
import com.transmem.data.db.Articles;
import com.transmem.data.db.Users;
import com.transmem.data.db.Roles;
import com.transmem.data.db.Corpora;
import com.transmem.data.db.CorpusTally;
import com.transmem.data.db.Sources;
import com.transmem.data.tm.Corpus;
import com.transmem.doc.IUnitSaver;
import com.transmem.doc.TmxLoader;
import com.transmem.doc.BitextLoader;
import com.transmem.nlp.ISegmenter;
import com.transmem.nlp.LanguageManager;
import com.transmem.nlp.LanguageException;

//import com.transmem.doc.FileParserFactory;

/**
 * Action class for uploading a pair of bilingual text files or a TMX file.
 * The sentences will be aligned and saved in the selected language corpus.
 * It returns to the edit page for editing the sentences.
 *
 * @version 0.1
 * @author Ted Wen
 * @date Jun. 2007
 */
public class UploadAction extends BaseAction implements IUnitSaver
{
	private Logger log_ = Logger.getLogger(UploadAction.class.getName());

	private	Session session_;
	private	Connection conn_;
	private	int uid_;
	private	String format_;
	private	String domain_;
	private	int permit_;
	private	int from_;
	private	boolean	failed_;
	private	String seqname_;
	private String srclang_;
	private String dstlang_;
	private ISegmenter sourceSegmenter_;
	private ISegmenter targetSegmenter_;
	private	ResultSet rs_, rs1_, rs2_;
	private int units_;
	private	String title_;

	public UploadAction()
	{
		super();
	}

	/**
	 * Execute action for saving bilingual sentences from an uploaded TMX file
	 * or two text files. The sentences may not be loaded as they are. If a sentence is too long
	 * (greater than 1000 characters) it may be broken into two or even more sentences. In this
	 * case, the translation sentence will be aligned to the separate sub-sentences. The alignment may be
	 * incorrect and need manual editing. After the sentences are loaded, it returns to the sentence
	 * compare page where the author can edit and correct the text sentence by sentence.
	 *
	 * Request Parameters:
	 * filetype : either 'tmx' or 'bitext'; anything other than 'tmx' (including a
	 *            missing parameter) is handled as bitext
	 * bisfile, bidfile: when filetype is bitext, these are the source and target language files
	 * tmxfile: the filename for the tmx file
	 * domain: optional domain code, defaults to "00"
	 * permit: optional; "P" or absent = public (0), "G" = the user's group, otherwise private (-1)
	 * bislang, bidlang: source and target language codes
	 * filetitle: optional document title, defaults to "Untitled document"
	 *
	 * If no user is logged in, ERR_NOT_LOGIN is sent and nothing is parsed.
	 *
	 * @param param - request wrapper giving access to parameters, uploaded files and the session
	 * @throws ServletException propagated from the parse methods
	 * @throws IOException propagated from the parse methods
	 */
	public void execute(ServletParams param) throws ServletException, IOException
	{
		log_.entering("UploadAction","execute");

		param.setContentType("text/html;charset=utf-8");

		this.session_ = param.getSession();

		Users usr = session_.getUser();
		if (usr == null)
		{
			param.sendError(MessageCode.ERR_NOT_LOGIN);
			return;
		}
		this.uid_ = usr.getUserID();

		//reset per-upload state
		this.failed_ = false;
		this.from_ = 0;
		this.units_ = 0;
		//force saveUnit() to rebuild segmenters and result sets for this upload,
		//in case this instance has served an earlier request
		this.sourceSegmenter_ = null;
		this.targetSegmenter_ = null;

		String domain = param.getParameter("domain");
		this.domain_ = (domain != null) ? domain : "00";

		String spermit = param.getParameter("permit");
		if (spermit==null || spermit.equals("P"))
			this.permit_ = 0;
		else if (spermit.equals("G"))
			this.permit_ = usr.getGroup();
		else
			this.permit_ = -1;	//private use

		this.srclang_ = param.getParameter("bislang");
		this.dstlang_ = param.getParameter("bidlang");

		this.title_ = "Untitled document";
		String title = param.getParameter("filetitle");
		if (title != null)
		{
			this.title_ = title;
			try
			{
				//the parameter was decoded as ISO-8859-1; reinterpret its bytes as UTF-8
				byte[] sb = title.getBytes("ISO-8859-1");
				this.title_ = new String(sb, "UTF-8");
			}
			catch (java.io.UnsupportedEncodingException e)
			{
				//keep the raw title, but do not hide the problem
				log_.warning("Cannot re-decode file title: "+e.toString());
			}
		}

		//constant-first comparison: a missing 'filetype' must not cause a NullPointerException
		String filetype = param.getParameter("filetype");
		if ("tmx".equals(filetype))
		{
			this.format_ = "TMX";
			parseTmx(this.session_, param);
			setNextPage(PageLinks.MYCORPUS_PAGE);
		}
		else
		{
			this.format_ = "TXT";
			parseBitext(this.session_, param);
			setNextPage(PageLinks.MYSENTENCES_PAGE);
		}
	}

	/**
	 * Parse a TMX file to filter out the sentence pairs and save in the database.
	 * A valid TMX file should follow the standard, with the following essential tags:
	 * <?xml encoding="UTF-8"?>
	 * <tmx>
	 * <header>
	 * <prop type="Domain">the domain id such as IT</prop>
	 * <prop type="Source">(-1)what dictionary</prop>
	 * </header>
	 * <body>
	 * <tu tuid="xx">
	 * <tuv xml:lang="EN">
	 * <seg><![CDATA[sentence...]]></seg>
	 * </tuv>
	 * <tuv xml:lang="ZH">
	 * <seg>...</seg>
	 * </tuv>
	 * </tu>
	 * </body>
	 * </tmx>
	 *
	 * If no file was uploaded under the field name 'tmxfile', the names of the files
	 * that were uploaded are logged, the upload is marked as failed and nothing is parsed.
	 *
	 * @param session - Session of the current request (not used here)
	 * @param param - request wrapper used to locate the uploaded file
	 * @throws ServletException declared for the loader
	 * @throws IOException declared for the loader
	 */
	protected void parseTmx(Session session, ServletParams param) throws ServletException, IOException
	{
		String tmxfilename = param.getFilePathName("tmxfile");
		if (tmxfilename == null)
		{
			//list what was actually uploaded to help diagnose a wrong form field name
			java.util.Enumeration<?> en = param.getFileNames();
			while (en.hasMoreElements())
			{
				log_.info(String.valueOf(en.nextElement()));
			}
			log_.severe("No uploaded file found for 'tmxfile', nothing to parse");
			this.failed_ = true;
			return;
		}

		log_.info("About to parse file '"+tmxfilename+"'");
		//create a loader which will call this.saveUnit to save a sentence pair;
		//the instance itself is not needed afterwards
		new TmxLoader(tmxfilename, this);

		log_.info("TMX file '"+tmxfilename+"' finished.");
	}

	/**
	 * Parse two text files by aligning sentence by sentence.
	 * The files are taken from the upload fields 'bisfile' (source) and 'bidfile' (target),
	 * their encodings from the parameters 'senc' and 'denc'.
	 *
	 * If either file is missing, the upload is marked as failed and nothing is parsed.
	 *
	 * @param session - Session of the current request (not used here)
	 * @param param - request wrapper used to locate the uploaded files
	 * @throws ServletException declared for the loader
	 * @throws IOException declared for the loader
	 */
	protected void parseBitext(Session session, ServletParams param) throws ServletException, IOException
	{
		String sfilename = param.getFilePathName("bisfile");
		String tfilename = param.getFilePathName("bidfile");
		String sencoding = param.getParameter("senc");
		String tencoding = param.getParameter("denc");

		if (sfilename == null || tfilename == null)
		{
			log_.severe(String.format("Missing uploaded file: bisfile='%s', bidfile='%s'",sfilename,tfilename));
			this.failed_ = true;
			return;
		}

		log_.info(String.format("About to parse file '%s' and '%s'",sfilename,tfilename));
		//create a loader which will call this.saveUnit to save a sentence pair;
		//the instance itself is not needed afterwards
		new BitextLoader(sfilename, tfilename, sencoding, tencoding, this);
	}

	/**
	 * IUnitSaver interface method, called by Loader when parsing starts.
	 * Obtains the MAIN database connection and switches it to manual commit so that
	 * end() can commit or roll back the whole upload. Any failure (including a null
	 * connection) is logged and recorded in failed_; no exception is thrown.
	 */
	public void start()
	{
		this.conn_ = null;
		try
		{
			this.conn_ = getConnection(this.session_.getHttpSession(),Databases.CATEGORY_MAIN,true);
			if (this.conn_ == null)
			{
				log_.severe("getConnection(MAIN,true) return null");
				//without a connection nothing can be saved; do not fall through
				//and dereference the null connection
				this.failed_ = true;
				return;
			}
			this.conn_.setAutoCommit(false);
		}
		catch (IOException ioe)
		{
			log_.severe(ioe.toString());
			this.failed_ = true;
		}
		catch (ServletException se)
		{
			log_.severe(se.toString());
			this.failed_ = true;
		}
		catch (SQLException x)
		{
			log_.severe("SQLException when opening MAIN connection: "+x.toString());
			this.failed_ = true;
		}
	}

	/**
	 * IUnitSaver interface method, called by Loader when parsing ends.
	 * If the upload did not fail, records the source and corpus tallies, refreshes the
	 * session, awards points for public uploads and commits. Otherwise rolls back.
	 * In every case the result sets, their statements and the connection are released,
	 * and the per-upload resources are reset. Does nothing if no connection is open.
	 */
	public void end()
	{
		if (this.conn_ == null)
			return;

		boolean settled = false;	//true once commit() or rollback() has completed
		try
		{
			if (!this.failed_)
			{
				//save into t_sources
				log_.info("Saving source: aid="+this.from_+", title="+this.title_);
				Corpus.saveSource(this.conn_, this.from_, this.title_, this.format_, this.uid_,
					this.srclang_, this.dstlang_, this.units_, this.domain_, this.permit_);
				//save t_corpora
				Corpus.saveCorpora(this.conn_, this.srclang_, this.dstlang_, this.domain_, this.permit_, this.units_);
				//done
				log_.info(this.units_+" units saved");
				//update variables in session
				updateSession(this.conn_, this.session_);
				//add points to the user if shared by the public
				if (this.permit_ == 0)
					addPoints(this.conn_, this.session_, this.units_);
				this.conn_.commit();
			}
			else
				this.conn_.rollback();
			settled = true;
		}
		catch (SQLException x)
		{
			log_.warning("end() caused SQLException: "+x.toString());
		}
		finally
		{
			if (!settled)
			{
				//covers SQLException as well as unexpected runtime exceptions
				try
				{
					this.conn_.rollback();
				}
				catch (SQLException se)
				{
					log_.warning("rollback() exception: "+se.toString());
				}
			}
			closeResultSetQuietly(this.rs_);
			closeResultSetQuietly(this.rs1_);
			closeResultSetQuietly(this.rs2_);
			this.rs_ = null;
			this.rs1_ = null;
			this.rs2_ = null;
			//saveUnit() rebuilds the data source when the segmenters are null
			this.sourceSegmenter_ = null;
			this.targetSegmenter_ = null;
			try
			{
				this.conn_.close();
			}
			catch (SQLException ce)
			{
				log_.warning("conn.close() exception: "+ce.toString());
			}
			this.conn_ = null;
		}
	}

	/**
	 * Closes a result set together with the statement that produced it, logging
	 * instead of throwing when closing fails.
	 * @param rs - result set to close, may be null
	 */
	private void closeResultSetQuietly(ResultSet rs)
	{
		if (rs == null)
			return;
		try
		{
			Statement owner = rs.getStatement();
			rs.close();
			if (owner != null)
				owner.close();
		}
		catch (SQLException x)
		{
			log_.warning("Failed to close result set: "+x.toString());
		}
	}

	/**
	 * IUnitSaver interface method, called by Loader when a property is ready.
	 * Recognised keys (case-insensitive):
	 * Domain  - replaces the domain code
	 * Source  - source id, either a plain number or a number in parentheses
	 *           followed by free text, e.g. "(-1)what dictionary"
	 * srclang - source language code (valid for TMX)
	 * dstlang - target language code
	 * Other keys are ignored. A null key or value, or a malformed Source value,
	 * is logged and leaves the current settings unchanged.
	 *
	 * @param key - property name
	 * @param value - property value
	 */
	public void setProperty(String key, String value)
	{
		if (key == null || value == null)
		{
			log_.warning("setProperty("+key+","+value+") ignored, null key or value");
			return;
		}
		if (key.equalsIgnoreCase("Domain"))
		{
			this.domain_ = value;
		}
		else if (key.equalsIgnoreCase("Source"))
		{
			String number = value;
			int open = value.indexOf('(');
			if (open >= 0)
			{
				//look for the closing parenthesis after the opening one, so that
				//a missing or misplaced ')' cannot produce an invalid substring range
				int close = value.indexOf(')', open + 1);
				if (close < 0)
				{
					log_.warning("setProperty('Source',"+value+"), missing ')'");
					return;
				}
				number = value.substring(open + 1, close);
			}
			try
			{
				this.from_ = Integer.parseInt(number.trim());
			}
			catch (NumberFormatException nfe)
			{
				log_.warning("setPropery('Source',"+number+"), bad number");
			}
		}
		else if (key.equalsIgnoreCase("srclang"))	//valid for TMX
		{
			this.srclang_ = value;
			log_.info("Set source language to "+value);
		}
		else if (key.equalsIgnoreCase("dstlang"))
		{
			this.dstlang_ = value;
			log_.info("Set target language to "+value);
		}
	}

	/**
	 * Refresh the corpus-related data cached in the session after an upload:
	 * the tallies for the current language pair, the list of corpus table names,
	 * and the sources owned by the current user.
	 * @param conn - Connection used for the queries
	 * @param session - Session receiving the refreshed data
	 * @throws SQLException if any of the queries fails
	 */
	private void updateSession(Connection conn, Session session) throws SQLException
	{
		Corpora corpora = new Corpora(conn);

		//tallies for the language pair of this upload
		ArrayList<CorpusTally> tallies = corpora.queryCorpusStats(this.srclang_, this.dstlang_);
		log_.info("queryCorpusStats return "+tallies.size());
		session.setCorpusTally(tallies);

		//names of the corpus tables
		ArrayList<String> tableNames = corpora.queryCorpusTables();
		log_.info("Corpora.queryCorpusTables() return "+tableNames.size()+" names");
		session.setCorpusNames(tableNames);

		//sources owned by the current user
		Sources sourceQuery = new Sources(conn);
		ArrayList<Sources> ownedSources = sourceQuery.queryByOwner(this.uid_);
		log_.info(ownedSources.size()+" sources loaded for user "+this.uid_);
		session.setCorpusSourceList(ownedSources);
	}

	/**
	 * IUnitSaver interface method, called by Loader when a pair of sentences ready for saving.
	 * On first use (or whenever the segmenters or result sets are missing) the data source
	 * is prepared via createDatasource(). The pair is then inserted into the corpus table
	 * under a fresh sequence id and indexed for both languages.
	 *
	 * @param src - source language sentence
	 * @param dst - target language sentence
	 * @throws java.sql.SQLException if no connection is open, if a database operation fails,
	 *         or if indexing raises a LanguageException (attached as the cause)
	 */
	public void saveUnit(String src, String dst) throws java.sql.SQLException
	{
		if (this.conn_ == null)
		{
			//start() failed or was never called; report it through the declared exception
			throw new SQLException("saveUnit() called without an open connection");
		}
		if (this.sourceSegmenter_==null || this.targetSegmenter_==null
			|| this.rs_==null || this.rs1_==null || this.rs2_==null)
		{
			createDatasource();
		}
		long sid = getSequenceLong(this.conn_, this.seqname_);
		log_.info("About to save sentence sid="+sid);
		this.rs_.moveToInsertRow();
		this.rs_.updateLong("F_SID", sid);
		this.rs_.updateString("F_Source", src);
		this.rs_.updateString("F_Target", dst);
		this.rs_.updateString("F_Domain", this.domain_);
		//allocate a source id once per upload unless one was supplied via setProperty
		if (this.from_ == 0)
			this.from_ = getSequenceInt(this.conn_, "S_Sources");
		this.rs_.updateInt("F_From", this.from_);
		this.rs_.updateInt("F_Permit", this.permit_);
		this.rs_.updateInt("F_Owner", this.uid_);
		this.rs_.insertRow();
		try
		{
			Corpus.makeIndices(this.rs1_, this.sourceSegmenter_, sid, src);
			Corpus.makeIndices(this.rs2_, this.targetSegmenter_, sid, dst);
		}
		catch (LanguageException le)
		{
			log_.severe("Error at Corpus.makeIndices(): "+le.toString());
			//NOTE(review): failed_ is not set here, so end() may still commit a sentence
			//whose index is incomplete - confirm whether that is intended
			SQLException wrapped = new SQLException("LanguageException at Corpus.makeIndices():"+le.toString());
			wrapped.initCause(le);	//keep the original stack trace
			throw wrapped;
		}
		this.units_ ++;
	}

	/**
	 * Prepare everything saveUnit() needs for the current language pair: make sure the
	 * corpus tables exist, create the two segmenters, derive the sentence-id sequence name
	 * and open the updatable result sets.
	 * On any failure the upload is marked as failed and the segmenters are cleared, so
	 * that a later saveUnit() call retries the setup instead of using missing result sets.
	 *
	 * @throws SQLException if a database operation fails, or wrapping a LanguageException
	 *         (attached as the cause)
	 */
	private void createDatasource() throws SQLException
	{
		try
		{
			//check availability of table, if not create it
			Corpora cs = new Corpora(this.conn_);
			if (cs.countTableByPair(this.srclang_, this.dstlang_) <= 0)
			{
				Corpus.createCorpusTables(this.srclang_, this.dstlang_, this.conn_);
				log_.info("Corpus tables for "+this.srclang_+"->"+this.dstlang_+" are created");
			}
			createSegmenters(this.session_, this.srclang_, this.dstlang_);
			log_.info("createSegmenters("+this.srclang_+","+this.dstlang_+") done");
			//get table name for the lang pair
			String tableName = makeCorpusTableName(this.srclang_, this.dstlang_);
			//sequence name: the table name with its first character replaced by 'S'
			this.seqname_ = "S"+tableName.substring(1);
			//create resultsets for corpus, indexes
			createResultSets(this.conn_, this.srclang_, this.dstlang_, tableName);
			log_.info("createResultSets() finished");
		}
		catch (SQLException e)
		{
			log_.warning("createDatasource() caused SQLException: "+e.toString());
			this.failed_ = true;
			this.sourceSegmenter_ = null;
			this.targetSegmenter_ = null;
			throw e;
		}
		catch (LanguageException le)
		{
			log_.warning("createDatasource() caused LanguageException: "+le.toString());
			this.failed_ = true;
			this.sourceSegmenter_ = null;
			this.targetSegmenter_ = null;
			SQLException wrapped = new SQLException("LanguageException at createSegmenters:"+le.toString());
			wrapped.initCause(le);	//keep the original stack trace
			throw wrapped;
		}
	}

	/**
	 * Create the segmenters for a language pair.
	 * The language codes are translated to language names through the "languages" map
	 * stored in the servlet context, and the resulting ISegmenter objects are stored in
	 * the member variables sourceSegmenter_ and targetSegmenter_.
	 * @param session - Session for the Http session
	 * @param scode - source language code like EN
	 * @param tcode - target language code like ZH
	 * @throws LanguageException if LanguageManager cannot create a segmenter
	 */
	private void createSegmenters(Session session, String scode, String tcode) throws LanguageException
	{
		Object registry = session.getHttpSession().getServletContext().getAttribute("languages");
		Map<String,String> namesByCode = (Map<String,String>)registry;

		String sourceName = namesByCode.get(scode);
		String targetName = namesByCode.get(tcode);

		this.sourceSegmenter_ = LanguageManager.createSegmenter(sourceName);
		this.targetSegmenter_ = LanguageManager.createSegmenter(targetName);
	}

	/**
	 * Create three resultset: corpus table, source index table, target index table.
	 * The source index table maps the source language in the corpus, not the sentence source.
	 * Each query selects on a condition chosen only to obtain an updatable result set
	 * through which rows can be inserted.
	 * The member result sets are assigned only when all three queries succeed; if one
	 * fails, the statements opened so far are closed before the exception propagates.
	 *
	 * NOTE(review): the table and index names are concatenated into the SQL text. They
	 * are built from the language codes, which can originate from request parameters -
	 * confirm that the codes are validated before they reach this method.
	 *
	 * @param conn - Connection reference
	 * @param scode - source language code like EN
	 * @param tcode - target language code like ZH
	 * @param tablename - name of the corpus table for the language pair
	 * @throws SQLException if a statement cannot be created or a query fails
	 */
	protected void createResultSets(Connection conn, String scode, String tcode, String tablename) throws SQLException
	{
		String index1 = String.format("%s_%sX",tablename,scode);
		String index2 = String.format("%s_%sX",tablename,tcode);

		Statement stmt = null;
		Statement stmt1 = null;
		Statement stmt2 = null;
		boolean opened = false;
		try
		{
			stmt = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE,ResultSet.CONCUR_UPDATABLE);
			ResultSet corpus = stmt.executeQuery("SELECT * FROM "+tablename+" WHERE F_SID = 0");

			stmt1 = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE,ResultSet.CONCUR_UPDATABLE);
			ResultSet sourceIndex = stmt1.executeQuery("SELECT * FROM "+index1+" WHERE F_Word='x'");

			stmt2 = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE,ResultSet.CONCUR_UPDATABLE);
			ResultSet targetIndex = stmt2.executeQuery("SELECT * FROM "+index2+" WHERE F_Word='x'");

			this.rs_ = corpus;
			this.rs1_ = sourceIndex;
			this.rs2_ = targetIndex;
			opened = true;
		}
		finally
		{
			if (!opened)
			{
				//do not leak the statements that were opened before the failure
				closeStatementQuietly(stmt2);
				closeStatementQuietly(stmt1);
				closeStatementQuietly(stmt);
			}
		}
	}

	/**
	 * Closes a statement (and with it its result set), logging instead of throwing
	 * when closing fails.
	 * @param stmt - statement to close, may be null
	 */
	private void closeStatementQuietly(Statement stmt)
	{
		if (stmt == null)
			return;
		try
		{
			stmt.close();
		}
		catch (SQLException x)
		{
			log_.warning("Failed to close statement: "+x.toString());
		}
	}

	/**
	 * Credit the session user for sentences contributed to the public reservoir.
	 * The award is the number of sentences multiplied by the session's
	 * points-per-share rate; it is added to the user's current points and the
	 * user record is updated through the given connection.
	 * @param conn - Connection ref
	 * @param session - Session ref
	 * @param sents - number of sentences
	 * @throws SQLException if updating the user record fails
	 */
	protected void addPoints(Connection conn, Session session, int sents) throws SQLException
	{
		final int pointsPerSentence = session.getPointsPerShare().intValue();
		final Users contributor = session.getUser();

		contributor.setConnection(conn);
		contributor.clearUpdates();

		final int earned = pointsPerSentence * sents;
		contributor.setPoints(contributor.getPoints() + earned);
		contributor.update();
	}

/*
	protected String getFileFormat(String filename)
	{
		String fileformat = "";
		if (filename.length() < 4)
		{
			return "";
		}
		fileformat = filename.substring(filename.lastIndexOf('.')+1).toUpperCase();
		if (fileformat.equalsIgnoreCase("html")) {
			fileformat = "HTM";
		} else if (fileformat.length() != 3) {
			return "";
		}
		return fileformat;
	}*/
}