package com.transmem.doc;
import java.io.IOException;
import java.io.File;
import java.util.logging.Logger;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.parsers.SAXParser;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
/**
* TMX Loader class parses a TMX file and sort out sentence pairs and call the specified
* unit saver to save the pair. If the TMX contains more than two languages, only the
* first two are loaded. The xml:lang attribute must exist on the tuv element.
* A valid TMX file should follow the standard, with the following essential tags:
* <?xml encoding="UTF-8"?>
* <tmx>
* <header>
* <prop type="Domain">the domain id such as IT</prop>
* <prop type="Source">(-1)what dictionary</prop>
* </head>
* <body>
* <tu tuid="xx">
* <tuv xml:lang="EN">
* <seg><![CDATA[sentence...]]></seg>
* </tuv>
* <tuv xml:lang="ZH">
* <seg>...</seg>
* </tuv>
* </tu>
* </body>
*/
public class TmxLoader extends DefaultHandler
{
public static final Logger log_ = Logger.getLogger(TmxLoader.class.getName());
private static final int E_NONE = 0;
private static final int E_TUV = 1;
private static final int E_SEG = 2;
private static final int E_PROP = 3;
private static final int E_SRC = 16;
private static final int E_DST = 32; //ORed on E_TUV and E_SEG to tell src or dst
private static final int A_DOMAIN = 5;
private static final int A_SOURCE = 6;
private IUnitSaver saver_ = null;
private String[] langs_ = new String[2];
private StringBuffer sb_;
private String src_, dst_;
private int status_, ptype_, tuvcount_;
private boolean first_ = true;
public TmxLoader(String filename, IUnitSaver saver) throws IOException
{
this.saver_ = saver;
parseTmxFile(filename);
}
/**
* Parse an TMX file using the SAX parser in order to process really large TMX files.
*/
public void parseTmxFile(String filename) throws IOException
{
try
{
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser parser = factory.newSAXParser();
File f = new File(filename);
parser.parse(f, this);
}
catch (org.xml.sax.SAXException e)
{
throw new IOException("SAXException: "+e.toString());
}
catch (IOException ioe)
{
throw ioe;
}
catch (javax.xml.parsers.ParserConfigurationException x)
{
throw new IOException("ParserConfigurationException: "+x.toString());
}
}
public void startDocument() throws SAXException
{
//System.out.println("startDocument()");
if (this.saver_ != null)
this.saver_.start();
}
public void endDocument() throws SAXException
{
//System.out.println("endDocument()");
if (this.saver_ != null)
this.saver_.end();
}
public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException
{
if (qName.equals("tuv"))
{
this.tuvcount_ ++;
this.status_ = E_TUV;
String lang = atts.getValue("xml:lang");
if (lang == null)
{
throw new SAXException("TMX tuv element contains no xml:lang attribute");
}
if (this.first_)
{
if (this.tuvcount_ < 3)
{
String langcode = lang.substring(0,2).toUpperCase();
switch (this.tuvcount_)
{
case 1:
this.status_ |= E_SRC;
saver_.setProperty("srclang", langcode);
this.langs_[0] = langcode;
break;
case 2:
this.status_ |= E_DST;
saver_.setProperty("dstlang", langcode);
this.langs_[1] = langcode;
this.first_ = false;
break;
}
}
}
else
{
if (lang.equals(this.langs_[0]))
this.status_ |= E_SRC;
else if (lang.equals(this.langs_[1]))
this.status_ |= E_DST;
}
}
else if (qName.equals("seg"))
{
this.status_ = E_SEG | this.status_ & 0xF0; //previous status should be E_TUV
this.sb_ = new StringBuffer();
}
else if (qName.equals("tu"))
{
this.tuvcount_ = 0;
}
else if (qName.equals("prop"))
{
this.status_ = E_PROP;
String sType = atts.getValue("type");
if (sType.equalsIgnoreCase("Domain"))
this.ptype_ = A_DOMAIN;
else if (sType.equalsIgnoreCase("Source"))
this.ptype_ = A_SOURCE;
else
this.ptype_ = 0;
//System.out.println("<prop type="+sType+">");
}
}
public void endElement(String uri, String localName, String qName) throws SAXException
{
//System.out.println("endElement("+qName+")");
//if (from_ > 10) System.exit(0);
if ((this.status_ & E_SEG) == E_SEG)
{
if ((this.status_ & E_SRC) == E_SRC)
{
//source language sentence
this.src_ = this.sb_.toString();
}
else if ((this.status_ & E_DST) == E_DST)
{
this.dst_ = this.sb_.toString();
}
if (src_ != null && dst_ != null && src_.length() > 0 && dst_.length() > 0)
{
if (this.saver_ != null)
{
try
{
this.saver_.saveUnit(this.src_, this.dst_);
}
catch (java.sql.SQLException sqle)
{
log_.severe("SQLException in saveUnit: "+sqle.toString());
throw new SAXException("SQLException in saveUnit: "+sqle.toString());
}
}
this.src_ = this.dst_ = null;
}
//System.out.println(this.sb_.toString());
this.status_ = E_NONE;
}
}
/**
* SAX event handler to receive characters from elements.
*/
public void characters(char[] ch, int start, int length) throws SAXException
{
//System.out.println("characters: "+length);
if (((this.status_ & E_SEG) == E_SEG) && ((this.status_ & 0xF0) > 0))
{
this.sb_.append(ch, start, length);
}
else if (this.status_ == E_PROP)
{
//System.out.println(new String(ch,start,length));
if (this.ptype_ == A_DOMAIN)
{
//this.domain_ = new String(ch, start, length);
if (this.saver_ != null)
this.saver_.setProperty("Domain", new String(ch, start, length));
}
else if (this.ptype_ == A_SOURCE)
{
String s = new String(ch, start, length);
int n1 = s.indexOf('(');
int n2 = s.indexOf(')');
String sn = s.substring(n1+1, n2);
try
{
//this.from_ = Integer.parseInt(sn);
if (this.saver_ != null)
this.saver_.setProperty("Source", sn);
}
catch (Exception e)
{
}
//this.sourceName_ = s.substring(n2+1);
}
this.status_ = E_NONE;
}
}
}