/************************************************************************** OmegaT - Computer Assisted Translation (CAT) tool with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. Copyright (C) 2000-2006 Keith Godfrey, Maxym Mykhalchuk, and Henry Pijffers Portions copyright 2007 Zoltan Bartko - bartkozoltan@bartkozoltan.com 2009 Alex Buloichik Home page: http://www.omegat.org/ Support center: http://groups.yahoo.com/group/OmegaT/ This file is part of OmegaT. OmegaT is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OmegaT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. **************************************************************************/ package org.omegat.util; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.omegat.core.data.PrepareTMXEntry; import org.omegat.core.data.ProjectProperties; /** * Class that store TMX (Translation Memory Exchange) files. * * @author Henry Pijffers (henry.pijffers@saxnot.com) * @author Maxym Mykhalchuk * */ public class TMXWriter { /** * Saves a TMX file to disk * * @param filename * The name of the file to create * @param forceValidTMX * When true, OmegaT-tags are stripped from the segments. * @param levelTwo * When true, the tmx is made compatible with level 2 (TMX * version 1.4) * @param m_config * Project configuration, to get the languages * @param data * Data for save to TMX, a map of {source segments, translation} * @throws IOException */ public static void buildTMXFile(final String filename, final boolean forceValidTMX, final boolean levelTwo, final ProjectProperties m_config, final Map<String, PrepareTMXEntry> data) throws IOException { // we got this far, so assume lang codes are proper String sourceLocale = m_config.getSourceLanguage().toString(); String targetLocale = m_config.getTargetLanguage().toString(); String segmenting; if (m_config.isSentenceSegmentingEnabled()) { segmenting = "sentence"; } else { segmenting = "paragraph"; } FileOutputStream fos = new FileOutputStream(filename); OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8"); PrintWriter out = new PrintWriter(osw); // PW is easier to use than // Buff.Writer String version = OStrings.VERSION; if (!OStrings.UPDATE.equals("0")) version = version + "_" + OStrings.UPDATE; // Write TMX header out.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); if (levelTwo) out.println("<!DOCTYPE tmx SYSTEM \"tmx14.dtd\">"); else out.println("<!DOCTYPE tmx SYSTEM \"tmx11.dtd\">"); if (levelTwo) out.println("<tmx version=\"1.4\">"); else out.println("<tmx version=\"1.1\">"); out.println(" <header"); out.println(" creationtool=\"OmegaT\""); out.println(" creationtoolversion=\"" + version + "\""); out.println(" segtype=\"" + segmenting + "\""); out.println(" o-tmf=\"OmegaT TMX\""); out.println(" adminlang=\"EN-US\""); out.println(" srclang=\"" + sourceLocale + "\""); out.println(" datatype=\"plaintext\""); out.println(" >"); out.println(" </header>"); out.println(" <body>"); // Determine language attribute to use String langAttr = levelTwo ? "xml:lang" : "lang"; // Write TUs String source = null; String target = null; String note = null; TMXDateParser dateParser = new TMXDateParser(); for (Map.Entry<String, PrepareTMXEntry> en : data.entrySet()) { PrepareTMXEntry transEntry = en.getValue(); source = forceValidTMX ? TagUtil.stripXmlTags(en.getKey()) : en.getKey(); target = forceValidTMX ? TagUtil.stripXmlTags(transEntry.translation) : transEntry.translation; source = StringUtil.makeValidXML(source); target = StringUtil.makeValidXML(target); if (transEntry.note != null) { note = forceValidTMX ? TagUtil.stripXmlTags(transEntry.note) : transEntry.note; note = StringUtil.makeValidXML(note); } // TO DO: This *possibly* converts occurrences in the actual text of // <fX> // which it should not. if (levelTwo) { source = makeLevelTwo(source); target = makeLevelTwo(target); } String changeIdPropertyString = (transEntry.changer != null && !"".equals(transEntry.changer) ? " changeid=\"" + transEntry.changer + "\"" : ""); String changeDatePropertyString = (transEntry.changeDate != 0 ? " changedate=\"" + dateParser.getTMXDate(transEntry.changeDate) + "\"" : ""); out.println(" <tu>"); out.println(" <tuv " + langAttr + "=\"" + sourceLocale + "\">"); out.println(" <seg>" + source + "</seg>"); out.println(" </tuv>"); out.println(" <tuv " + langAttr + "=\"" + targetLocale + "\"" + changeDatePropertyString + changeIdPropertyString + ">"); out.println(" <seg>" + target + "</seg>"); out.println(" </tuv>"); if (note != null) { out.println(" <note>" + note + "</note>"); } out.println(" </tu>"); } // Write TMX footer out.println(" </body>"); out.println("</tmx>"); // Close output stream out.close(); } /** * Creates three-quarted-assed TMX level 2 segments from OmegaT internal * segments */ private static String makeLevelTwo(String segment) { // Create a storage buffer for the result StringBuilder result = new StringBuilder(segment.length() * 2); // Find all single tags // Matcher match = // Pattern.compile("<[a-zA-Z\-]+\\d+/>").matcher(segment); Matcher match = Pattern.compile("<[\\S&&[^/\\d]]+(\\d+)/>").matcher(segment); int previousMatchEnd = 0; while (match.find()) { // get the OmegaT tag and tag number String tag = match.group(); String tagNumber = match.group(1); // Wrap the OmegaT tag in TMX tags in the result result.append(segment.substring(previousMatchEnd, match.start())); // text // betw. // prev. // & // cur. // match result.append("<ph x='"); // TMX start tag + i attribute result.append(tagNumber); // OmegaT tag number used as x attribute result.append("'>"); result.append(tag); // OmegaT tag result.append("</ph>"); // TMX end tag // Store the current match's end positions previousMatchEnd = match.end(); } // Append the text from the last match (single tag) to the end of the // segment result.append(segment.substring(previousMatchEnd, segment.length())); segment = result.toString(); // Store intermediate result back in // segment result.setLength(0); // Clear result buffer // Find all start tags match = Pattern.compile("<[\\S&&[^/\\d]]+(\\d+)>").matcher(segment); previousMatchEnd = 0; while (match.find()) { // get the OmegaT tag and tag number String tag = match.group(); String tagNumber = match.group(1); // Check if the corresponding end tag is in this segment too String endTag = "</" + tag.substring(4); boolean paired = segment.indexOf(endTag) > -1; // Wrap the OmegaT tag in TMX tags in the result result.append(segment.substring(previousMatchEnd, match.start())); // text // betw. // prev. // & // cur. // match if (paired) { result.append("<bpt i='"); // TMX start tag + i attribute result.append(tagNumber); // OmegaT tag number used as i // attribute result.append("'"); } else { result.append("<it pos='begin'"); // TMX start tag } result.append(" x='"); // TMX x attribute result.append(tagNumber); // OmegaT tag number used as x attribute result.append("'>"); result.append(tag); // OmegaT tag result.append(paired ? "</bpt>" : "</it>"); // TMX end tag // Store the current match's end positions previousMatchEnd = match.end(); } // Append the text from the last match (start tag) to the end of the // segment result.append(segment.substring(previousMatchEnd, segment.length())); segment = result.toString(); // Store intermediate result back in // segment result.setLength(0); // Clear result buffer // Find all end tags match = Pattern.compile("</[\\S&&[^\\d]]+(\\d+)>").matcher(segment); previousMatchEnd = 0; while (match.find()) { // get the OmegaT tag and tag number String tag = match.group(); String tagNumber = match.group(1); // Check if the corresponding start tag is in this segment too String startTag = "<" + tag.substring(5); boolean paired = segment.indexOf(startTag) > -1; // Wrap the OmegaT tag in TMX tags in the result result.append(segment.substring(previousMatchEnd, match.start())); // text // betw. // prev. // & // cur. // match result.append(paired ? "<ept i='" : "<it pos='end' x='"); // TMX // start // tag + // i/x // attribute result.append(tagNumber); // OmegaT tag number used as i/x attribute result.append("'>"); result.append(tag); // OmegaT tag result.append(paired ? "</ept>" : "</it>"); // TMX end tag // Store the current match's end positions previousMatchEnd = match.end(); } // Append the text from the last match (end tag) to the end of the // segment result.append(segment.substring(previousMatchEnd, segment.length())); // Done, return result return result.toString(); } }