package loader;
import java.sql.*;
//import javax.xml.parsers.*;
//import org.w3c.dom.*;
//import org.xml.sax.*;
import java.io.IOException;
import java.io.File;
import java.util.Map;
import java.util.HashMap;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.parsers.SAXParser;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
import com.transmem.nlp.*;
/**
* Customized exception object
*/
class LoadException extends Exception
{
public static final long serialVersionUID = -1;
public LoadException(String msg)
{
super(msg);
}
}
/**
* Load a TMX file into the example database.
*
* @author Ted Wen
* @update April, 2007
*
* This module works for English-Chinese sentences only.
* The TMX tu elements contain attribute xml:lang="EN" and ZH respectively.
* Other language units are ignored.
* The sentence and translation are inserted into table T_ENZH.
* Words are stored in the T_ENZHX and T_ZHENX index tables.
* Segmentation and stemming done by com.transmem.nlp.* modules.
*
* Note: to use another host, change the getConnection paramter 'localhost' in openDatabase() method.
* To enforce security, change username and password from postgres to another restricted user account.
*/
public class LoadTmx extends DefaultHandler
{
private Connection conn_ = null;
private ResultSet rs_ = null;
private ResultSet rs1_ = null;
private ResultSet rs2_ = null;
private ILinguist english_;
private ILinguist chinese_;
private String domain_ = "00";
private int from_ = 0;
private String sourceName_;
private String lang_;
private StringBuffer sb_;
private String en_, zh_;
private int status_, ptype_;
private static final int E_NONE = 0;
private static final int E_TUV = 1;
private static final int E_SEG = 2;
private static final int E_PROP = 3;
private static final int A_DOMAIN = 5;
private static final int A_SOURCE = 6;
public LoadTmx(String filename) throws LoadException
{
//loadTmxFile(filename);
parseTmxFile(filename);
}
protected long getNextSid() throws LoadException
{
try
{
String sql = "SELECT nextval('S_ENZH')";
Statement stmt = this.conn_.createStatement();
ResultSet rs = stmt.executeQuery(sql);
long sid = 0;
if (rs.next())
sid = rs.getLong(1);
rs.close();
stmt.close();
return sid;
}
catch (SQLException se)
{
throw new LoadException("SQLException occurred: "+se.getMessage());
}
}
protected long saveUnit(String en, String zh) throws LoadException
{
if (this.rs_ == null)
{
String sql = "SELECT * FROM T_ENZH WHERE F_SID = 0";
try
{
Statement stmt = this.conn_.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE,ResultSet.CONCUR_UPDATABLE);
this.rs_ = stmt.executeQuery(sql);
Statement stmt1 = this.conn_.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE,ResultSet.CONCUR_UPDATABLE);
this.rs1_ = stmt1.executeQuery("SELECT * FROM T_ENZHX WHERE F_Word='x'");
Statement stmt2 = this.conn_.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE,ResultSet.CONCUR_UPDATABLE);
this.rs2_ = stmt2.executeQuery("SELECT * FROM T_ZHENX WHERE F_Word='x'");
}
catch (SQLException se)
{
throw new LoadException("SQLException occurred: "+se.getMessage());
}
}
if (this.rs_ == null)
throw new LoadException("ResultSet not ready! ");
long sid = getNextSid();
//System.out.println("SID = "+sid);
try
{
this.rs_.moveToInsertRow();
this.rs_.updateLong("F_SID", sid);
this.rs_.updateString("F_Source", en);
this.rs_.updateString("F_Target", zh);
this.rs_.updateString("F_Domain", this.domain_);
this.rs_.updateInt("F_From", this.from_);
//this.rs_.updateString("F_Permit", "P"); //default
//this.rs_.updateInt("F_Owner", 0); //default
this.rs_.insertRow();
return sid;
}
catch (SQLException se)
{
throw new LoadException("SQLException occurred: "+se.getMessage());
}
}
protected void saveSource() throws LoadException
{
PreparedStatement ps = null;
try {
String sql = "INSERT INTO T_Sources(F_SourceID,F_Name) VALUES(?,?)";
ps = this.conn_.prepareStatement(sql);
ps.setInt(1,this.from_);
String s = (this.sourceName_==null)?"":this.sourceName_;
ps.setString(2,s);
ps.executeUpdate();
}
catch (SQLException e)
{
throw new LoadException(e.getMessage());
}
finally
{
if (ps != null)
try { ps.close(); } catch (SQLException x) {}
}
}
protected void index(ResultSet rs, ILinguist linguist, long sid, String sent) throws LoadException
{
if (rs == null)
throw new LoadException("Index ResultSet null");
try
{
//System.out.println(sent);
String[] words = linguist.indexkeys(sent);
if (words.length > 0)
{
Map<String,String> dup = new HashMap<String,String>();
int i = 0;
for (String word : words)
{
//System.out.print(word+" ");
if (!dup.containsKey(word))
{
dup.put(word, word);
rs.moveToInsertRow();
rs.updateString("F_Word", word);
rs.updateLong("F_SID", sid);
rs.updateInt("F_Offset", i++);
rs.insertRow();
}
}
//System.out.println();
}
}
catch (SQLException se)
{
throw new LoadException("SQLException occurred: "+se.getMessage());
}
catch (LanguageException le)
{
throw new LoadException("LanguageException occurred indexing: "+le.getMessage());
}
}
protected void openDatabase() throws LoadException
{
try
{
conn_ = DriverManager.getConnection("jdbc:postgresql://localhost/transmem","tm", "yi4ku4wh3");
conn_.setAutoCommit(false);
}
catch (SQLException se)
{
throw new LoadException("SQLException occurred: "+se.getMessage());
}
}
protected void closeDatabase()
{
if (this.conn_ != null)
{
try
{
conn_.close();
}
catch (SQLException e)
{
}
}
}
protected void createIndexer() throws LoadException
{
try
{
LanguageManager.loadLangNames();
english_ = LanguageManager.createLinguist("EN", LanguageManager.INDEXER);
chinese_ = LanguageManager.createLinguist("ZH", LanguageManager.INDEXER);
}
catch (LanguageException le)
{
throw new LoadException("LanguageException occurred while creating linguist: "+le.getMessage());
}
}
public void startDocument() throws SAXException
{
//System.out.println("startDocument()");
try
{
openDatabase();
}
catch (LoadException le)
{
System.err.println(le.getMessage());
throw new SAXException(le.getMessage());
}
try
{
createIndexer();
}
catch (LoadException le)
{
closeDatabase();
System.err.println(le.getMessage());
throw new SAXException(le.getMessage());
}
}
public void endDocument() throws SAXException
{
//System.out.println("endDocument()");
try
{
conn_.commit();
closeDatabase();
}
catch (SQLException sqle)
{
throw new SAXException(sqle.toString());
}
}
public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException
{
//System.out.println("startElement("+qName+","+atts.getLength()+")");
//from_ ++;
//if (qName.equals("tmx")) this.isTmx_ = true;
if (qName.equals("tuv"))
{
this.status_ = E_TUV;
this.lang_ = atts.getValue("xml:lang");
}
else if (qName.equals("seg"))
{
this.status_ = E_SEG;
this.sb_ = new StringBuffer();
}
else if (qName.equals("prop"))
{
this.status_ = E_PROP;
String sType = atts.getValue("type");
if (sType.equalsIgnoreCase("Domain"))
this.ptype_ = A_DOMAIN;
else if (sType.equalsIgnoreCase("Source"))
this.ptype_ = A_SOURCE;
else
this.ptype_ = 0;
//System.out.println("<prop type="+sType+">");
}
}
public void endElement(String uri, String localName, String qName) throws SAXException
{
//System.out.println("endElement("+qName+")");
//if (from_ > 10) System.exit(0);
if (this.status_ == E_SEG)
{
//save sentence
if (this.lang_.equalsIgnoreCase("EN"))
{
this.en_ = this.sb_.toString();
}
else if (this.lang_.equalsIgnoreCase("ZH"))
{
this.zh_ = this.sb_.toString();
}
if (zh_ != null && en_ != null && en_.length()>0 && zh_.length()>0 && en_.length()<2000 && zh_.length()<2000)
{
try
{
long sid = saveUnit(en_, zh_);
index(this.rs1_, english_, sid, en_);
}
catch (LoadException le)
{
throw new SAXException(le.toString());
}
this.en_ = this.zh_ = null;
}
//System.out.println(this.sb_.toString());
this.status_ = E_NONE;
}
else if (qName.equals("header"))
{
System.out.print("saveSource: domain="+this.domain_+", from="+this.from_);
try
{
saveSource();
} catch (LoadException le)
{
throw new SAXException(le.toString());
}
}
}
public void characters(char[] ch, int start, int length) throws SAXException
{
//System.out.println("characters: "+length);
if (this.status_ == E_SEG)
{
this.sb_.append(ch, start, length);
}
else if (this.status_ == E_PROP)
{
//System.out.println(new String(ch,start,length));
if (this.ptype_ == A_DOMAIN)
{
this.domain_ = new String(ch, start, length);
}
else if (this.ptype_ == A_SOURCE)
{
String s = new String(ch, start, length);
int n1 = s.indexOf('(');
int n2 = s.indexOf(')');
String sn = s.substring(n1+1, n2);
try {
this.from_ = Integer.parseInt(sn);
} catch (Exception e)
{
}
this.sourceName_ = s.substring(n2+1);
}
this.status_ = E_NONE;
}
}
/**
* Parse an XML (TMX) file using the SAX parser in order to process really large TMX files.
*/
public void parseTmxFile(String filename) throws LoadException
{
try {
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser parser = factory.newSAXParser();
File f = new File(filename);
parser.parse(f, this);
}
catch (org.xml.sax.SAXException e)
{
if (this.conn_ != null)
{
closeDatabase();
}
throw new LoadException(e.getMessage());
}
catch (IOException ioe)
{
throw new LoadException(ioe.getMessage());
}
catch (javax.xml.parsers.ParserConfigurationException x)
{
throw new LoadException(x.toString());
}
}
/*
public void loadTmxFile(String filename) throws LoadException
{
try
{
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = factory.newDocumentBuilder();
Document doc = builder.parse(filename);
Element root = doc.getDocumentElement();
//System.out.println(root.getNodeName());
if (!root.getNodeName().equals("tmx"))
{
throw new LoadException("Not a valid TMX file.");
}
NodeList nodes = root.getElementsByTagName("header");
if (nodes != null && nodes.getLength() > 0)
{
Node header = nodes.item(0);
nodes = ((Element)header).getElementsByTagName("prop");
for (int i=0; i<nodes.getLength(); i++)
{
Element nd = (Element)nodes.item(i);
if (nd.getAttribute("type").equalsIgnoreCase("Domain"))
{
this.domain_ = nd.getFirstChild().getNodeValue();
//System.out.println(this.domain_);
}
else if (nd.getAttribute("type").equalsIgnoreCase("Source"))
{
String s = nd.getFirstChild().getNodeValue();
int n1 = s.indexOf('(');
int n2 = s.indexOf(')',n1+1);
String sn = s.substring(n1+1,n2);
this.from_ = Integer.parseInt(sn); //should be negative int
if (this.from_ > 0) {
throw new LoadException("Source "+sn+" not negative");
}
this.sourceName_ = s.substring(n2+1);
//INSERT INTO T_Sources(F_SourceID,F_Name) VALUES(this.from_,sourceName_);
}
}
}
else
this.domain_ = "00"; //00=general
//open database connection
openDatabase();
//save t_sources
if (this.from_ < 0)
saveSource();
//create indexer
createIndexer();
NodeList tus = root.getElementsByTagName("tu");
//System.out.println(tus.getLength());
for (int i=0; i<tus.getLength(); i++)
{
Node tuv = tus.item(i);
if (tuv instanceof Element)
{
String en, zh;
en = zh = null;
NodeList tuvs = ((Element)tuv).getElementsByTagName("tuv");
for (int k=0; k<tuvs.getLength(); k++)
{
Node t = tuvs.item(k);
if (t instanceof Element)
{
Element te = (Element)t;
//System.out.println(te.getAttribute("xml:lang"));
if (te.getAttribute("xml:lang").equals("EN"))
{
NodeList segs = te.getElementsByTagName("seg");
Node node = segs.item(0);
//System.out.println(node.getNodeName());
//System.out.println(node.getFirstChild().getNodeValue());
en = node.getFirstChild().getNodeValue();
}
else if (te.getAttribute("xml:lang").equals("ZH"))
{
NodeList segs = te.getElementsByTagName("seg");
Node node = segs.item(0);
if (node != null)
{
Node sc = node.getFirstChild();
if (sc != null)
{
zh = sc.getNodeValue();
}
}
}
}
}
if (en != null && zh != null && zh.length()>0 && en.length()<2000 && zh.length()<2000)
{
//System.out.println(en+" ==> "+zh);
long sid = saveUnit(en,zh);
index(this.rs1_, english_, sid, en);
//index(this.rs2_, chinese_, sid, zh);
}
}
}
System.out.println("Commit");
this.conn_.commit();
}
catch (SQLException sqle)
{
throw new LoadException("SQLException occurred: "+sqle.getMessage());
}
catch (IOException ioe)
{
throw new LoadException("IOException occurred: "+ioe.getMessage());
}
catch (ParserConfigurationException pce)
{
throw new LoadException("ParserConfigurationException occurred: "+pce.getMessage());
}
catch (SAXException se)
{
throw new LoadException("SAXException occurred: "+se.getMessage());
}
finally
{
closeDatabase();
}
}
*/
public static void main(String[] args)
{
try {
Class.forName("org.postgresql.Driver");
} catch (ClassNotFoundException cnfe) {
System.out.println("Can't find the PostgreSQL jdbc driver!");
System.exit(1);
}
if (args.length < 1)
{
System.out.println("Usage:\n\tjava LoadTmx <tmx_file>");
return;
}
String filename = args[0];
try
{
LoadTmx t = new LoadTmx(filename);
}
catch (Exception e)
{
e.printStackTrace();
}
}
}