/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2006 Henry Pijffers
2010 Alex Buloichik
2011 Alex Buloichik, Martin Fleurke
2012 Alex Buloichik, Didier Briel
2013 Aaron Madlon-Kay
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.util;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamWriter;
import org.omegat.core.data.TMXEntry;
/**
* Helper for writing TMX files, using StAX.
*
* We can't use JAXB for writing because it changes spaces on formatted output.
*
* @author Alex Buloichik (alex73mail@gmail.com)
* @author Martin Fleurke
* @author Didier Briel
* @author Aaron Madlon-Kay
*/
public class TMXWriter2 implements AutoCloseable {
    /**
     * Line terminator written between elements and substituted for "\n" inside note and segment text.
     *
     * NOTE(review): left non-final and package-visible as in the original; presumably so that tests can
     * override it - confirm before tightening.
     */
    static String LINE_SEPARATOR = System.lineSeparator();

    /** Factory used to create the StAX writer of every instance. */
    private static final XMLOutputFactory FACTORY = XMLOutputFactory.newInstance();

    /**
     * Matches a numbered inline tag inside a segment: "&lt;", an optional "/" (group 1), a name made of
     * non-whitespace characters other than "/" and digits (group 2), a number (group 3), an optional "/"
     * (group 4) and "&gt;". Examples: &lt;b0&gt;, &lt;/b0&gt;, &lt;br1/&gt;.
     */
    protected static final Pattern TAGS_ANY = Pattern.compile("<(/?)([\\S&&[^/\\d]]+)(\\d+)(/?)>");

    /** Kind of inline tag found by {@link #TAGS_ANY}: standalone, opening or closing. */
    enum TAG_TYPE {
        SINGLE, START, END
    }

    private final OutputStream out;
    private final XMLStreamWriter xml;
    private final String langSrc, langTar;
    private final boolean levelTwo;
    private final boolean forceValidTMX;

    /**
     * DateFormat with format YYYYMMDDThhmmssZ able to display a date in UTC time.
     *
     * SimpleDateFormat IS NOT THREAD SAFE !!!
     */
    private final SimpleDateFormat tmxDateFormat;

    /**
     * Opens the file and writes everything that precedes the first translation unit: the XML declaration,
     * the DOCTYPE, the opening &lt;tmx&gt; element, the &lt;header&gt; and the opening &lt;body&gt; element.
     *
     * If anything fails after the file has been opened, the underlying stream is closed before the
     * exception is propagated, so a failed construction does not leak the file handle.
     *
     * @param file
     *            file to write the TMX to
     * @param sourceLanguage
     *            source language; written as the header "srclang" and as the language of every source
     *            &lt;tuv&gt;
     * @param targetLanguage
     *            target language; written as the language of every target &lt;tuv&gt;
     * @param sentenceSegmentingEnabled
     *            when true the header "segtype" is "sentence", otherwise "paragraph"
     * @param levelTwo
     *            When true, the tmx is made compatible with level 2 (TMX version 1.4)
     * @param forceValidTMX
     *            when true, note and segment texts are passed through TagUtil.stripXmlTags before being
     *            written
     * @throws Exception
     *             if the file cannot be opened or the XML prolog cannot be written
     */
    public TMXWriter2(File file, final Language sourceLanguage, final Language targetLanguage,
            boolean sentenceSegmentingEnabled, boolean levelTwo, boolean forceValidTMX) throws Exception {
        this.levelTwo = levelTwo;
        this.forceValidTMX = forceValidTMX;

        // Resolve everything that does not need the file first, so that a null language fails
        // before the target file is opened.
        langSrc = sourceLanguage.toString();
        langTar = targetLanguage.toString();
        tmxDateFormat = new SimpleDateFormat("yyyyMMdd'T'HHmmss'Z'", Locale.ENGLISH);
        tmxDateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));

        out = new BufferedOutputStream(new FileOutputStream(file));
        try {
            xml = FACTORY.createXMLStreamWriter(out, StandardCharsets.UTF_8.name());
            xml.writeStartDocument(StandardCharsets.UTF_8.name(), "1.0");
            xml.writeCharacters(LINE_SEPARATOR);

            if (levelTwo) {
                xml.writeDTD("<!DOCTYPE tmx SYSTEM \"tmx14.dtd\">");
            } else {
                xml.writeDTD("<!DOCTYPE tmx SYSTEM \"tmx11.dtd\">");
            }
            xml.writeCharacters(LINE_SEPARATOR);
            xml.writeStartElement("tmx");
            xml.writeAttribute("version", levelTwo ? "1.4" : "1.1");
            xml.writeCharacters(LINE_SEPARATOR);

            writeHeader(sourceLanguage, targetLanguage, sentenceSegmentingEnabled);

            xml.writeCharacters(" ");
            xml.writeStartElement("body");
            xml.writeCharacters(LINE_SEPARATOR);
        } catch (Exception e) {
            // The caller never receives this instance, so nobody else can release the stream.
            try {
                out.close();
            } catch (Exception closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Writes the closing &lt;/body&gt; and &lt;/tmx&gt; tags, ends the document and releases the XML
     * writer and the underlying stream.
     *
     * The stream is closed even when finishing the document or closing the XML writer fails.
     *
     * @throws Exception
     *             if the document cannot be completed or a resource cannot be closed
     */
    @Override
    public void close() throws Exception {
        try {
            xml.writeCharacters(" ");
            xml.writeEndElement(); // body
            xml.writeCharacters(LINE_SEPARATOR);
            xml.writeEndElement(); // tmx
            xml.writeCharacters(LINE_SEPARATOR);
            xml.writeEndDocument();
        } finally {
            try {
                xml.close();
            } finally {
                // Nested so that a failure in xml.close() cannot skip closing the file.
                out.close();
            }
        }
    }

    /**
     * Writes an XML comment followed by a line separator.
     *
     * @param comment
     *            comment text, handed to the XML writer unchanged
     */
    public void writeComment(String comment) throws Exception {
        xml.writeComment(comment);
        xml.writeCharacters(LINE_SEPARATOR);
    }

    /**
     * Write one entry, taking note, creator, changer and both dates from {@code entry}.
     *
     * @param source
     *            source text, or null to omit the source &lt;tuv&gt;
     * @param translation
     *            translated text, or null to omit the target &lt;tuv&gt;
     * @param entry
     *            supplies note, creator, creationDate, changer and changeDate
     * @param propValues
     *            pairs with property name and values
     */
    public void writeEntry(String source, String translation, TMXEntry entry, List<String> propValues)
            throws Exception {
        writeEntry(source, translation, entry.note, entry.creator, entry.creationDate, entry.changer, entry.changeDate,
                propValues);
    }

    /**
     * Write one &lt;tu&gt; element: its properties, its note and up to two &lt;tuv&gt; children.
     *
     * @param source
     *            source text, or null to omit the source &lt;tuv&gt;
     * @param translation
     *            translated text, or null to omit the target &lt;tuv&gt;
     * @param note
     *            note text; not written when null or empty
     * @param creator
     *            written as "creationid" on the target &lt;tuv&gt; when not empty
     * @param creationDate
     *            written as "creationdate" on the target &lt;tuv&gt; when greater than zero
     * @param changer
     *            written as "changeid" on the target &lt;tuv&gt; when not empty
     * @param changeDate
     *            written as "changedate" on the target &lt;tuv&gt; when greater than zero
     * @param propValues
     *            pairs with property name and values (name at even index, value right after it); pairs with
     *            a null value are skipped; may be null
     * @throws NullPointerException
     *             if both source and translation are null
     * @throws IllegalArgumentException
     *             if propValues has an odd number of elements
     */
    public void writeEntry(String source, String translation, String note, String creator, long creationDate,
            String changer, long changeDate, List<String> propValues) throws Exception {
        if (source == null && translation == null) {
            throw new NullPointerException(
                    "The TMX spec requires at least one <tuv> per <tu>. Source and translation can't both be null.");
        }
        // Reject a dangling property name before anything is written, instead of failing with an
        // IndexOutOfBoundsException in the middle of a half-written <tu>.
        if (propValues != null && propValues.size() % 2 != 0) {
            throw new IllegalArgumentException(
                    "propValues must contain name/value pairs, but has " + propValues.size() + " elements.");
        }

        xml.writeCharacters(" ");
        xml.writeStartElement("tu");
        xml.writeCharacters(LINE_SEPARATOR);

        writeProperties(propValues);

        if (!StringUtil.isEmpty(note)) {
            String cleanNote = cleanText(note);
            xml.writeCharacters(" ");
            xml.writeStartElement("note");
            xml.writeCharacters(platformLineSeparator(cleanNote));
            xml.writeEndElement(); // note
            xml.writeCharacters(LINE_SEPARATOR);
        }

        if (source != null) {
            String cleanSource = cleanText(source);
            openTuv(langSrc);
            writeSegmentAndCloseTuv(cleanSource);
        }

        if (translation != null) {
            String cleanTranslation = cleanText(translation);
            openTuv(langTar);
            // Author ids go through the same invalid-character filter as every other
            // user-supplied value written by this class.
            if (!StringUtil.isEmpty(changer)) {
                xml.writeAttribute("changeid", StringUtil.removeXMLInvalidChars(changer));
            }
            if (changeDate > 0) {
                xml.writeAttribute("changedate", tmxDateFormat.format(new Date(changeDate)));
            }
            if (!StringUtil.isEmpty(creator)) {
                xml.writeAttribute("creationid", StringUtil.removeXMLInvalidChars(creator));
            }
            if (creationDate > 0) {
                xml.writeAttribute("creationdate", tmxDateFormat.format(new Date(creationDate)));
            }
            writeSegmentAndCloseTuv(cleanTranslation);
        }

        xml.writeCharacters(" ");
        xml.writeEndElement(); // tu
        xml.writeCharacters(LINE_SEPARATOR);
    }

    /** Writes one &lt;prop&gt; element per name/value pair, skipping pairs whose value is null. */
    private void writeProperties(List<String> propValues) throws Exception {
        if (propValues == null) {
            return;
        }
        for (int i = 0; i < propValues.size(); i += 2) {
            String value = propValues.get(i + 1);
            if (value == null) {
                // nothing to write for this property
                continue;
            }
            xml.writeCharacters(" ");
            xml.writeStartElement("prop");
            xml.writeAttribute("type", StringUtil.removeXMLInvalidChars(propValues.get(i)));
            xml.writeCharacters(StringUtil.removeXMLInvalidChars(value));
            xml.writeEndElement(); // prop
            xml.writeCharacters(LINE_SEPARATOR);
        }
    }

    /** Removes XML-invalid characters and, when forceValidTMX is set, strips XML tags as well. */
    private String cleanText(String text) {
        String cleaned = StringUtil.removeXMLInvalidChars(text);
        return forceValidTMX ? TagUtil.stripXmlTags(cleaned) : cleaned;
    }

    /** Starts a &lt;tuv&gt; element and writes its language attribute in the form used by the TMX level. */
    private void openTuv(String lang) throws Exception {
        xml.writeCharacters(" ");
        xml.writeStartElement("tuv");
        if (levelTwo) {
            xml.writeAttribute("xml", "", "lang", lang);
        } else {
            xml.writeAttribute("lang", lang);
        }
    }

    /** Writes the &lt;seg&gt; of the currently open &lt;tuv&gt; and then closes that &lt;tuv&gt;. */
    private void writeSegmentAndCloseTuv(String text) throws Exception {
        xml.writeCharacters(LINE_SEPARATOR);
        String converted = platformLineSeparator(text);
        if (levelTwo) {
            writeLevelTwo(converted);
        } else {
            writeLevelOne(converted);
        }
        xml.writeCharacters(LINE_SEPARATOR);
        xml.writeCharacters(" ");
        xml.writeEndElement(); // tuv
        xml.writeCharacters(LINE_SEPARATOR);
    }

    /**
     * Writes the empty &lt;header&gt; element with its attributes.
     *
     * @param sourceLanguage
     *            written as "srclang"
     * @param targetLanguage
     *            not used by the header
     * @param sentenceSegmentingEnabled
     *            selects "sentence" or "paragraph" for "segtype"
     */
    private void writeHeader(final Language sourceLanguage, final Language targetLanguage,
            boolean sentenceSegmentingEnabled) throws Exception {
        xml.writeCharacters(" ");
        xml.writeEmptyElement("header");
        xml.writeAttribute("creationtool", OStrings.getApplicationName());
        xml.writeAttribute("o-tmf", "OmegaT TMX");
        xml.writeAttribute("adminlang", "EN-US");
        xml.writeAttribute("datatype", "plaintext");
        xml.writeAttribute("creationtoolversion", OStrings.getVersion());
        xml.writeAttribute("segtype", sentenceSegmentingEnabled ? "sentence" : "paragraph");
        xml.writeAttribute("srclang", sourceLanguage.toString());
        xml.writeCharacters(LINE_SEPARATOR);
    }

    /**
     * Create simple segment: the whole text is written as character data of a &lt;seg&gt; element.
     */
    private void writeLevelOne(String segment) throws Exception {
        xml.writeCharacters(" ");
        xml.writeStartElement("seg");
        xml.writeCharacters(segment);
        xml.writeEndElement();
    }

    /**
     * Create a level 2 segment: every tag matched by {@link #TAGS_ANY} is wrapped in an inline element and
     * the text between the tags is written as character data.
     *
     * A standalone tag becomes &lt;ph&gt;. An opening tag becomes &lt;bpt&gt; when the segment also contains
     * the closing tag with the same name and number, otherwise &lt;it pos="begin"&gt;. A closing tag becomes
     * &lt;ept&gt; when the segment also contains the matching opening tag, otherwise &lt;it pos="end"&gt;.
     */
    private void writeLevelTwo(String segment) throws Exception {
        xml.writeCharacters(" ");
        xml.writeStartElement("seg");

        int pos = 0;
        Matcher m = TAGS_ANY.matcher(segment);
        while (m.find(pos)) {
            // plain text between the previous tag (or the segment start) and this tag
            xml.writeCharacters(segment.substring(pos, m.start()));
            pos = m.end();

            String tag = m.group();
            String tagName = m.group(2);
            String tagNumber = m.group(3);

            TAG_TYPE tagType;
            if (!m.group(1).isEmpty()) {
                tagType = TAG_TYPE.END;
            } else if (!m.group(4).isEmpty()) {
                tagType = TAG_TYPE.SINGLE;
            } else {
                tagType = TAG_TYPE.START;
            }

            switch (tagType) {
            case SINGLE:
                writeInlineTag("ph", tag, "x", tagNumber);
                break;
            case START:
                if (segment.contains("</" + tagName + tagNumber + ">")) {
                    writeInlineTag("bpt", tag, "i", tagNumber, "x", tagNumber);
                } else {
                    writeInlineTag("it", tag, "pos", "begin", "x", tagNumber);
                }
                break;
            case END:
                if (segment.contains("<" + tagName + tagNumber + ">")) {
                    writeInlineTag("ept", tag, "i", tagNumber);
                } else {
                    writeInlineTag("it", tag, "pos", "end", "x", tagNumber);
                }
                break;
            default:
                throw new RuntimeException("Unknow tag type");
            }
        }

        // remaining text after the last tag (the whole segment when there was no tag)
        xml.writeCharacters(segment.substring(pos));
        xml.writeEndElement();
    }

    /**
     * Writes one inline element whose content is the original tag text.
     *
     * @param element
     *            name of the inline element
     * @param tagText
     *            matched tag, written as character data
     * @param attributes
     *            attribute names and values, alternating, written in the given order
     */
    private void writeInlineTag(String element, String tagText, String... attributes) throws Exception {
        xml.writeStartElement(element);
        for (int i = 0; i < attributes.length; i += 2) {
            xml.writeAttribute(attributes[i], attributes[i + 1]);
        }
        xml.writeCharacters(tagText);
        xml.writeEndElement();
    }

    /**
     * Replaces \n with platform specific end of lines
     * @param text The string to be converted
     * @return The converted string
     */
    private String platformLineSeparator(String text) {
        return text.replace("\n", LINE_SEPARATOR);
    }
}