package com.transmem.data.tm;
import com.transmem.nlp.*;
import com.transmem.data.db.*;
import com.transmem.data.db.Databases;
//import com.transmem.nlp.LanguageException;
import java.util.ArrayList;
import java.util.logging.Logger;
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.ResultSet;
import java.sql.Statement;
import java.sql.PreparedStatement;
/**
* Corpus
*
* The Corpus class can be instantiated for each session to query the sentence
* database for similar sentences and phrases.
* The query method accepts a sentence of source language and queries a list of
* similar sentences for it and presents the translations.
* The corpus is determined by the constructor according to the language argument.
*
* @author Ted Wen
* @update April, 2007
*
*/
public class Corpus
{
private static final Logger log_ = Logger.getLogger(Corpus.class.getName());
private String source_;
private String target_;
private boolean swap_; //if source is ZH, target is EN, tablename is ENZH, then swap is true
private String corpusName_;
private String indexName_;
private ISegmenter segmenter_;
private IStemmer stemmer_;
private Databases dbs_;
private int maxSids_ = 500; //maximum number of example sentences allowed
private ArrayList<Long> sids_;
private int pageLen_ = 10; //number of sentences per page to return
/**
* Construct a Corpus instance with a source and target language code.
* @param dbs - Databases object
* @param langpair - 4-letter language pair like ENZH
*/
public Corpus(Databases dbs, String langpair)
{
assert(dbs != null);
this.dbs_ = dbs;
this.source_ = langpair.substring(0,2);
this.target_ = langpair.substring(2);
try
{
this.corpusName_ = queryTablename(langpair);
if (this.corpusName_.indexOf(langpair)<0)
{
this.swap_ = true;
this.indexName_ = "T_"+langpair+"x";
}
else
this.indexName_ = this.corpusName_ + "x";
this.segmenter_ = LanguageManager.createSegmenter(source_);
this.stemmer_ = LanguageManager.createStemmer(source_);
}
catch (LanguageException le)
{
log_.severe("LanguageManager.createXXX("+source_+") failed: "+le.getMessage());
}
}
/**
* Construct a Corpus instance with a source and target language code.
* @param dbs - Databases object
* @param source - 2-letter language code like EN, ZH
* @param target - 2-letter language code like EN, ZH
*/
public Corpus(Databases dbs, String source, String target)
{
assert(dbs != null);
this.dbs_ = dbs;
this.source_ = source;
this.target_ = target;
try
{
this.corpusName_ = queryTablename(source,target);
if (this.corpusName_.indexOf(source)>2)
{
this.swap_ = true;
this.indexName_ = "T_"+target+source+"x";
}
else
this.indexName_ = this.corpusName_ + "x";
this.segmenter_ = LanguageManager.createSegmenter(source);
this.stemmer_ = LanguageManager.createStemmer(source);
}
catch (LanguageException le)
{
log_.severe("LanguageManager.createXXX("+source+") failed: "+le.getMessage());
}
}
public int getMaximumSids()
{
return this.maxSids_;
}
public void setMaximumSids(int maxsids)
{
this.maxSids_ = maxsids;
}
public int getPageLen()
{
return this.pageLen_;
}
public void setPageLen(int len)
{
this.pageLen_ = len;
}
public String getTableName()
{
return this.corpusName_;
}
public String getIndexName()
{
return this.indexName_;
}
/**
* Query tablename from languages table for given langpair 4 letters.
* @param langpair - like ENZH
* @return String of table name for the language pair
*/
protected String queryTablename(String langpair) throws LanguageException
{
if (this.dbs_ == null)
throw new LanguageException("Connection not given");
Connection con = null;
try
{
con = this.dbs_.getConnection(Databases.CATEGORY_CORPUS);
Languages lang = new Languages(con);
return lang.queryTableName(langpair);
}
catch (SQLException se)
{
throw new LanguageException("SQLException occurred while query T_Languages("+langpair+")");
}
finally
{
if (con != null)
{
try
{
con.close();
}
catch (SQLException se)
{
}
}
}
}
/**
* Query tablename from languages table for given source and target 2-letter language codes.
* @param scode - source language code, 2-letter code for F_SourceCode
* @param tcode - 2-letter language code for F_TargetCode
* @return String of table name for the language pair
*/
protected String queryTablename(String scode, String tcode) throws LanguageException
{
if (this.dbs_ == null)
throw new LanguageException("Connection not given");
Connection con = null;
try
{
con = this.dbs_.getConnection(Databases.CATEGORY_CORPUS);
Languages lang = new Languages(con);
return lang.queryTableNameBySourceTarget(scode, tcode);
}
catch (SQLException se)
{
throw new LanguageException("SQLException occurred while query T_Languages("+scode+","+tcode+")");
}
finally
{
if (con != null)
{
try
{
con.close();
}
catch (SQLException se)
{
}
}
}
}
/**
* Query example source and target language sentences from the corpus for a given sentence in source language.
* The method returns the number of sentences found in the corpus.
* About domain: If the specified domain is 00, then every domain is searched, otherwise, only a specified domain.
* About permit: If unspecified, only Public sentences are searched, if Group is required, then
* the G sentences with owner is in the group are visited first, then union with Public ones; if Owner is required, then
* the O sentences with owner is the user are visited first, then public ones.
*
* @param userid - user's ID
* @param group - user's group ID
* @param s - sentence in source language
* @param domain - industrial domain tag like 'IT', '00' means all domains
* @param permit - permit tag like 'P','G','O'
* @return number of pages
*/
public int query(int userid, int group, String s, String domain, String permit) throws LanguageException
{
if (this.segmenter_ == null || this.stemmer_ == null)
{
throw new LanguageException("Segmenter and/or Stemmer not created, possibly language pair not supported");
}
//1. segment the sentence into words
String[] words = segmenter_.segment(s);
//2. if stemmer applies, then find the stems for these tokens
String[] stems = new String[words.length];
boolean b = stemmer_.applies();
for (int i=0; i<words.length; i++)
{
stems[i] = b ? stemmer_.stem(words[i]) : words[i];
}
//3. prepare SQL query to get SID
//select sid,count(sid) as c from t_enzh where f_word in ('','') group by sid order by c desc
String sql = makeQueryStatement(userid, group, indexName_, corpusName_, stems, domain, permit);
//4. query on connection
log_.info("Corpus.query, SQL:"+sql);
this.sids_ = new ArrayList<Long>();
Connection con = null;
try
{
con = this.dbs_.getConnection(Databases.CATEGORY_CORPUS);
Statement stmt = con.createStatement();
ResultSet rs = stmt.executeQuery(sql);
int n = 0;
//TODO: compute distance to s and rank these sentences accordingly
while (rs.next())
{
long sid = rs.getLong(1);
this.sids_.add(new Long(sid));
//log_.info("SID="+sid+",count="+rs.getInt(2));
if (n++ > this.maxSids_) break;
}
rs.close();
stmt.close();
log_.info(n+" SIDs found for this query");
//int pages = n / this.pageLen_;
//if (n % this.pageLen_ > 0) pages ++;
return n / this.pageLen_;
}
catch (SQLException se)
{
throw new LanguageException("SQLException occurred: "+se.getMessage());
}
finally
{
if (con != null)
{
try
{
con.close();
}
catch (SQLException se)
{
}
}
}
}
/**
* Make a SQL query statement for this query. It considers both domain and permit.
*
* @param indexName - table name of the index
* @param userid - user's ID
* @param group - group ID the user is in
* @param indexName - name of index table
* @param domain - industrial domain to limit
* @param permit - permitted sentences to query
* @return SQL string
*/
protected String makeQueryStatement(int userid, int group, String indexName, String tableName, String[] stems, String domain, String permit)
{
//select distinct x.f_sid,count(x.f_sid) as c from t_enzhx x,t_enzh a
//where x.f_sid=a.f_sid and a.f_permit='P' and a.f_domain='00' and
//a.f_owner=1 and f_word in ('javascript','not','java')
//group by x.f_sid order by c desc;
StringBuffer sb = new StringBuffer();
sb.append("SELECT DISTINCT x.F_SID,COUNT(x.F_SID) AS c FROM ");
sb.append(indexName);
sb.append(" x,");
sb.append(tableName);
sb.append(" a WHERE x.F_SID=a.F_SID");
//check domain, if not 00, apply
if (domain != null && !domain.equals("00"))
{
sb.append(" AND a.F_Domain='");
sb.append(domain);
sb.append('\'');
}
//check permit for 'O', 'G' + 'P'
if (permit == null || permit.equals("P"))
sb.append(" AND a.F_Permit=0");
else if (permit.equals("O"))
{
//sb.append(" AND (a.F_Permit='P' OR a.F_Permit='O' AND a.F_Owner=");
sb.append(" AND (a.F_Permit=0 OR a.F_Permit=-1 AND a.F_Owner=");
sb.append(userid);
sb.append(')');
}
else //if (permit.equals("G"))
{
//sb.append(" AND (a.F_Permit='P' OR a.F_Permit='G' AND a.F_Owner IN (SELECT F_UserID FROM T_Users WHERE F_Group=");
sb.append(" AND (a.F_Permit=0 OR a.F_Permit=");
sb.append(group);
sb.append("))");
}
sb.append(" AND F_Word IN (");
boolean first = true;
for (String w : stems)
{
if (!first)
{
sb.append(',');
}
else
{
first = false;
}
sb.append('\'');
sb.append(w);
sb.append('\'');
}
sb.append(") GROUP BY x.F_SID ORDER BY c DESC");
return sb.toString();
}
/**
* Query and return a page of example sentences from queried results.
* The query method must have been called before calling this method.
* @param which - index of page starting from 0,1,...n
* @return array list of Example objects containing source and target sentences
*/
public ArrayList<Example> getPage(int which) throws LanguageException
{
ArrayList<Example> exs = new ArrayList<Example>();
if (this.sids_.size() > which)
{
int n = (which + 1) * this.pageLen_;
if (n > this.sids_.size()) n = this.sids_.size();
for (int i=0; i<n; i++) exs.add(null);
StringBuffer sb = new StringBuffer();
sb.append("SELECT F_SID,");
if (this.swap_)
sb.append("F_Target,F_Source FROM");
else
sb.append("F_Source,F_Target FROM ");
sb.append(this.corpusName_);
sb.append(" WHERE F_SID IN (");
int x = which * this.pageLen_;
for (int i=x; i<n; i++)
{
if (i > x) sb.append(',');
sb.append(this.sids_.get(i).longValue());
}
sb.append(")");
Connection con = null;
try
{
log_.info("Corpus.getPage "+which+", SQL="+sb.toString());
con = this.dbs_.getConnection(Databases.CATEGORY_CORPUS);
Statement stmt = con.createStatement();
ResultSet rs = stmt.executeQuery(sb.toString());
while (rs.next())
{
Example e = new Example(rs.getString(2), rs.getString(3));
long sid = rs.getLong(1);
//log_.info("sid="+sid);
int pos = this.sids_.indexOf(new Long(sid));
//log_.info("pos="+pos);
exs.set(pos, e);
//log_.info("exs.set("+pos+",e)");
}
rs.close();
stmt.close();
}
catch (SQLException se)
{
throw new LanguageException("SQLException occurred: "+se.getMessage());
}
finally
{
if (con != null)
{
try
{
con.close();
}
catch (SQLException se)
{
}
}
}
}
return exs;
}
/**
* Query sentences for a page of SIDs and format them into HTML list.
* @param which - page index
* @return string of html-tagged examples
*/
public String getPageAsHtml(int which) throws LanguageException
{
ArrayList<Example> exs = getPage(which);
StringBuffer sb = new StringBuffer();
int x = which * this.pageLen_ + 1;
//prepend the page number as a string
sb.append(x);
sb.append('/');
sb.append(this.sids_.size());
sb.append('*');
//
sb.append("<table cellspacing=0 cellpadding=0>");
for (Example e : exs)
{
if (e != null) {
sb.append("<tr><td>");
sb.append(x);
sb.append(".</td><td><font color=#C66A00>");
sb.append(e.getSource());
sb.append("</font></td></tr><tr><td valign=\"bottom\"><img src=\"images/b11.gif\" id=\"x");
sb.append(x);
sb.append("\"/></td><td id=\"ehx");
sb.append(x);
sb.append("\"><font color=#003399>");
sb.append(e.getTarget());
sb.append("</font></td></tr>");
x ++;
}
}
sb.append("</table>");
return sb.toString();
}
}