/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
               2007 Didier Briel
               2010 Antonio Vilei
               2012 Didier Briel
               2013 Alex Buloichik, Didier Briel
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.filters3;

import java.util.ArrayList;
import java.util.List;

import org.omegat.core.Core;
import org.omegat.core.data.ProtectedPart;
import org.omegat.filters2.TranslationException;
import org.omegat.filters3.xml.Handler;
import org.omegat.filters3.xml.XMLContentBasedTag;
import org.omegat.filters3.xml.XMLDialect;
import org.omegat.filters3.xml.XMLText;
import org.omegat.util.StringUtil;
import org.omegat.util.TagUtil;

/**
 * Translatable entry. Holds a list of source tags and text, translated text and
 * maintains correspondence between tags in source and in target.
 * <p>
 * The entry keeps a lazily computed "translatable region"
 * <code>[firstGood, lastGood]</code> inside its element list. The region is
 * recomputed on demand after any change to the elements or to the tag
 * aggregation setting. When the entry has nothing translatable the region is
 * the empty range <code>firstGood = -1</code>, <code>lastGood = -2</code>.
 *
 * @author Maxym Mykhalchuk
 * @author Didier Briel
 * @author Alex Buloichik (alex73mail@gmail.com)
 */
public class Entry {
    final XMLDialect xmlDialect;
    final Handler handler;

    /**
     * Creates an empty entry.
     *
     * @param xmlDialect
     *            dialect handed on to the entry created for the translation
     * @param handler
     *            handler queried for the "remove all tags" context flag and
     *            for paragraph-tag detection
     */
    public Entry(XMLDialect xmlDialect, Handler handler) {
        this.xmlDialect = xmlDialect;
        this.handler = handler;
    }

    /**
     * Cleans up this entry: drops all elements, the stored translation and
     * the remembered text instance, and invalidates the detected tags.
     */
    public void clear() {
        tagsDetected = false;
        elements.clear();
        translatedEntry = null;
        textInstance = null;
    }

    // //////////////////////////////////////////////////////////////////////////
    // Dealing with source here
    // //////////////////////////////////////////////////////////////////////////

    /** Whether tags are aggregated before the translatable region is detected. */
    private boolean tagsAggregationEnabled = false;

    /**
     * Whether the "first translatable" and "last translatable" tags were
     * detected. They are the first starting tag that has its ending in the
     * paragraph and the last ending tag that has its beginning in the
     * paragraph, respectively.
     */
    private boolean tagsDetected = false;

    private int firstGood;

    /** Returns index of the "first translatable" tag. */
    private int getFirstGood() {
        detectAndEnumerateTags();
        return firstGood;
    }

    private int lastGood;

    /** Returns index of the "last translatable" tag. */
    private int getLastGood() {
        detectAndEnumerateTags();
        return lastGood;
    }

    private Text textInstance = null;

    /** Returns an instance of {@link Text} class used to populate this entry. */
    private Text getTextInstance() {
        detectAndEnumerateTags();
        return textInstance;
    }

    /**
     * Detects the first and the last translatable tags and assigns shortcut
     * indexes to all tags in the translatable region. Does nothing when
     * {@link #tagsDetected} is already true; otherwise optionally calls
     * {@link #aggregateTags()}, then {@link #detectTags()} and
     * {@link #enumerateTags(int, int)}.
     */
    private void detectAndEnumerateTags() {
        if (!tagsDetected) {
            if (tagsAggregationEnabled) {
                aggregateTags();
            }
            detectTags();
            // The flag must be raised before the getters below are called:
            // they re-enter this method and would otherwise recurse forever.
            tagsDetected = true;
            enumerateTags(getFirstGood(), getLastGood());
        }
    }

    /** Forces the translatable region to be recomputed on next access. */
    public void resetTagDetected() {
        tagsDetected = false;
    }

    /**
     * Aggregate tags. The current OpenXML filter finds too many tags, usually
     * causing what users call the "tag soup". Tags aggregation can help
     * alleviate this problem, but can sometimes lead to semantic issues.
     * Aggregation is OK only as a temporary hack, until we improve the OpenXML
     * filter.
     * <p>
     * Every run of consecutive {@link Tag} elements is replaced by a single
     * {@link AggregatedTag}; non-tag elements are kept as they are.
     */
    private void aggregateTags() {
        List<Element> newElements = new ArrayList<Element>();
        AggregatedTag aggregated = null;

        for (Element elem : elements) {
            if (elem instanceof Tag) {
                // Add this tag to the aggregated tag
                if (aggregated == null) {
                    aggregated = new AggregatedTag("tag", null, Tag.Type.ALONE, new Attributes());
                }
                aggregated.add((Tag) elem);
            } else {
                // Not a tag: flush the pending aggregated tag (if any), then
                // keep this element.
                if (aggregated != null) {
                    newElements.add(aggregated);
                    aggregated = null;
                }
                newElements.add(elem);
            }
        }

        // A run of tags at the very end is still pending
        if (aggregated != null) {
            newElements.add(aggregated);
        }

        // Replace the content of the element list in place
        elements.clear();
        elements.addAll(newElements);
    }

    /**
     * Detects the first starting tag that has its ending in the paragraph
     * ("first translatable") and the last ending tag that has its beginning in
     * the paragraph ("last translatable"), and stores their indexes in
     * {@link #firstGood} and {@link #lastGood}. Also remembers the first
     * meaningful {@link Text} in {@link #textInstance}.
     * <p>
     * When nothing translatable is found, the region is set to the empty
     * range <code>firstGood = -1</code>, <code>lastGood = -2</code>.
     */
    private void detectTags() {
        // first, detecting if we have any text and where we have it
        int textStart = -1;
        for (int i = 0; i < size(); i++) {
            Element elem = get(i);
            if ((elem instanceof Text) && ((Text) elem).isMeaningful()) {
                textStart = i;
                break;
            }
            // NOTE(review): there is no break here, so a later meaningful text
            // overrides this index, and without any meaningful text the LAST
            // content-based tag wins - confirm this is intended.
            if (elem instanceof XMLContentBasedTag) {
                textStart = i;
            }
        }
        for (int i = 0; i < size(); i++) {
            Element elem = get(i);
            if ((elem instanceof Text) && ((Text) elem).isMeaningful()) {
                textInstance = (Text) elem;
                break;
            }
        }

        if (textStart < 0) {
            // we have no translatable text in the whole entry
            firstGood = -1;
            lastGood = -2;
            textInstance = null;
            return;
        }

        // NOTE(review): only meaningful text is considered here, content-based
        // tags are not - confirm this asymmetry with the scan above.
        int textEnd = textStart;
        for (int i = size() - 1; i >= 0; i--) {
            Element elem = get(i);
            if ((elem instanceof Text) && ((Text) elem).isMeaningful()) {
                textEnd = i;
                break;
            }
        }

        // if a content-based tag is inside the text, expand the text range up
        // to its paired content-based tag
        for (int i = textStart; i <= textEnd; i++) {
            Element elem = get(i);
            if (elem instanceof XMLContentBasedTag) {
                XMLContentBasedTag tag = (XMLContentBasedTag) elem;
                if (tag.getTag().equals("bpt") || tag.getTag().equals("ept")) {
                    // find id of paired tag
                    String id = StringUtil.nvl(tag.getAttribute("rid"), tag.getAttribute("id"),
                            tag.getAttribute("i"));
                    if (id == null) {
                        continue;
                    }
                    // find paired tag before
                    for (int j = textStart - 1; j >= 0; j--) {
                        if (get(j) instanceof XMLContentBasedTag) {
                            XMLContentBasedTag tag2 = (XMLContentBasedTag) get(j);
                            if (tag2.getTag().equals("bpt") || tag2.getTag().equals("ept")) {
                                // find id of paired tag
                                String id2 = StringUtil.nvl(tag2.getAttribute("rid"),
                                        tag2.getAttribute("id"), tag2.getAttribute("i"));
                                if (id.equals(id2)) {
                                    textStart = j;
                                }
                            }
                        }
                    }
                    // find paired tag after
                    for (int j = textEnd + 1; j < size(); j++) {
                        if (get(j) instanceof XMLContentBasedTag) {
                            XMLContentBasedTag tag2 = (XMLContentBasedTag) get(j);
                            if (tag2.getTag().equals("bpt") || tag2.getTag().equals("ept")) {
                                // find id of paired tag
                                String id2 = StringUtil.nvl(tag2.getAttribute("rid"),
                                        tag2.getAttribute("id"), tag2.getAttribute("i"));
                                if (id.equals(id2)) {
                                    textEnd = j;
                                }
                            }
                        }
                    }
                }
            }
        }

        // //////////////////////////////////////////////////////////////////////
        // "first good"
        // detecting the first starting tag that has its ending in the paragraph
        boolean found = false;
        for (firstGood = 0; firstGood < textStart; firstGood++) {
            Element goodElem = get(firstGood);
            if (!(goodElem instanceof Tag)) {
                continue;
            }
            Tag good = (Tag) goodElem;
            if (Tag.Type.BEGIN != good.getType()) {
                continue;
            }

            // look for the matching end tag, counting nested tags of the same
            // name
            int recursion = 1;
            for (int i = firstGood + 1; i < textEnd; i++) {
                Element candElement = get(i);
                if (candElement instanceof Tag) {
                    Tag cand = (Tag) candElement;
                    if (cand.getTag().equals(good.getTag())) {
                        if (Tag.Type.BEGIN == cand.getType()) {
                            recursion++;
                        } else if (Tag.Type.END == cand.getType()) {
                            recursion--;
                            if (recursion == 0) {
                                // the pair counts only if it closes after the
                                // text has started
                                if (i > textStart) {
                                    found = true;
                                }
                                break;
                            }
                        }
                    }
                }
            }
            // if we could find an ending, this is a "good one"
            if (found) {
                break;
            }
        }
        if (!found) {
            firstGood = textStart;
        }

        // //////////////////////////////////////////////////////////////////////
        // "last good"
        // detecting the last ending tag that has its starting in the paragraph
        found = false;
        for (lastGood = size() - 1; lastGood > textEnd; lastGood--) {
            Element goodElem = get(lastGood);
            if (!(goodElem instanceof Tag)) {
                continue;
            }
            Tag good = (Tag) goodElem;
            if (Tag.Type.END != good.getType()) {
                continue;
            }

            // look backwards for the matching begin tag, counting nested tags
            // of the same name
            int recursion = 1;
            for (int i = lastGood - 1; i > textStart; i--) {
                Element candElement = get(i);
                if (candElement instanceof Tag) {
                    Tag cand = (Tag) candElement;
                    if (cand.getTag().equals(good.getTag())) {
                        if (Tag.Type.END == cand.getType()) {
                            recursion++;
                        } else if (Tag.Type.BEGIN == cand.getType()) {
                            recursion--;
                            if (recursion == 0) {
                                // the pair counts only if it opens before the
                                // text has ended
                                if (i < textEnd) {
                                    found = true;
                                }
                                break;
                            }
                        }
                    }
                }
            }
            // if we could find a starting, this is a "good one"
            if (found) {
                break;
            }
        }
        if (!found) {
            lastGood = textEnd;
        }

        boolean removeTags;
        if (handler.getContext().isRemoveAllTags()) {
            // If "Remove Tags" is on, "Remove leading and trailing tags" must
            // be on as well
            removeTags = true;
        } else {
            removeTags = Core.getFilterMaster().getConfig().isRemoveTags();
        }
        // Leading and trailing tags were left out above - bring them back into
        // the region if they must be kept. The scan stops at a paragraph tag.
        if (!removeTags) {
            for (int i = firstGood - 1; i >= 0; i--) {
                Element elem = get(i);
                if (elem instanceof Tag) {
                    if (handler.isParagraphTag((Tag) elem)) {
                        break;
                    }
                    firstGood = i;
                }
            }
            for (int i = lastGood + 1; i < size(); i++) {
                Element elem = get(i);
                if (elem instanceof Tag) {
                    if (handler.isParagraphTag((Tag) elem)) {
                        break;
                    }
                    lastGood = i;
                }
            }
        }

        boolean removeSpacesAround = Core.getFilterMaster().getConfig().isRemoveSpacesNonseg();
        // Surrounding non-meaningful text was left out above - bring it back
        // into the region if it must be kept. The scan stops at a paragraph
        // tag.
        if (!removeSpacesAround) {
            for (int i = firstGood - 1; i >= 0; i--) {
                Element elem = get(i);
                if (elem instanceof Tag) {
                    if (handler.isParagraphTag((Tag) elem)) {
                        break;
                    }
                }
                if ((elem instanceof Text) && !((Text) elem).isMeaningful()) {
                    firstGood = i;
                }
            }
            for (int i = lastGood + 1; i < size(); i++) {
                Element elem = get(i);
                if (elem instanceof Tag) {
                    if (handler.isParagraphTag((Tag) elem)) {
                        break;
                    }
                }
                if ((elem instanceof Text) && !((Text) elem).isMeaningful()) {
                    lastGood = i;
                }
            }
        }
    }

    /**
     * Enumerates tags to be properly shortcut. Standalone and beginning tags
     * get consecutive indexes; an ending tag gets the index of its matching
     * beginning tag inside the region, or a fresh index if it has none.
     *
     * @param firstGood
     *            index of the first element of the translatable region
     * @param lastGood
     *            index of the last element of the translatable region
     */
    private void enumerateTags(int firstGood, int lastGood) {
        int n = 0;
        for (int i = firstGood; i <= lastGood; i++) {
            Element elem = get(i);
            if (elem instanceof Tag) {
                Tag tag = (Tag) elem;
                if (Tag.Type.ALONE == tag.getType() || Tag.Type.BEGIN == tag.getType()) {
                    tag.setIndex(n);
                    n++;
                } else if (Tag.Type.END == tag.getType()) {
                    tag.setIndex(-1); // indication of an error
                    // trying to lookup for appropriate starting tag
                    int recursion = 1;
                    for (int j = i - 1; j >= firstGood; j--) {
                        Element otherElem = get(j);
                        if (otherElem instanceof Tag) {
                            Tag other = (Tag) otherElem;
                            if (other.getTag().equals(tag.getTag())) {
                                if (Tag.Type.END == other.getType()) {
                                    recursion++;
                                } else if (Tag.Type.BEGIN == other.getType()) {
                                    recursion--;
                                    if (recursion == 0) {
                                        tag.setIndex(other.getIndex());
                                        break;
                                    }
                                }
                            }
                        }
                    }
                    if (tag.getIndex() < 0) {
                        // ending tag without a starting one
                        tag.setIndex(n);
                        n++;
                    }
                }
            }
        }
    }

    /**
     * Returns shortcut string representation of the entry source. This is what
     * the user translates. E.g. for
     * <code>Here's &lt;b&gt;bold text&lt;/b&gt;</code> should return
     * <code>Here's &lt;b0&gt;bold text&lt;/b0&gt;</code>.
     *
     * @param tagsAggregation
     *            Whether tags of this entry can be aggregated. Changing the
     *            value resets the detected tags.
     * @param xmlDialect
     *            dialect for processing shortcuts
     * @param protectedParts
     *            list handed to
     *            <code>XMLDialect.constructShortcuts</code> together with the
     *            elements of the translatable region
     * @return the shortcut string, or an empty string when the entry has no
     *         translatable region
     */
    public String sourceToShortcut(boolean tagsAggregation, XMLDialect xmlDialect,
            List<ProtectedPart> protectedParts) {
        if (tagsAggregation != this.tagsAggregationEnabled) {
            this.tagsAggregationEnabled = tagsAggregation;
            // Each change to tags aggregation setting resets detected tags
            tagsDetected = false;
        }

        if (getFirstGood() <= getLastGood()) {
            return xmlDialect.constructShortcuts(elements.subList(getFirstGood(), getLastGood() + 1),
                    protectedParts);
        } else {
            return "";
        }
    }

    /**
     * Same as {@link #sourceToShortcut(boolean, XMLDialect, List)} with the
     * current tags aggregation setting.
     */
    private String sourceToShortcut(XMLDialect xmlDialect, List<ProtectedPart> protectedParts) {
        return sourceToShortcut(tagsAggregationEnabled, xmlDialect, protectedParts);
    }

    /**
     * Returns long XML-encoded representation of the source entry for storing
     * in TMX. E.g. for <code>Here's &lt;b&gt;bold text&lt;/b&gt;</code> should
     * return
     * {@code Here's <bpt i="0">&lt;b0&gt;</bpt>bold text<ept i="0">&lt;/b0&gt;</ept>}.
     */
    public String sourceToTMX() {
        StringBuilder buf = new StringBuilder();
        for (int i = 0; i < size(); i++) {
            buf.append(get(i).toTMX());
        }
        return buf.toString();
    }

    /**
     * Returns the entry source in its original form as it was in original
     * document. E.g. for <code>Here's &lt;b&gt;bold text&lt;/b&gt;</code>
     * should return the same string
     * <code>Here's &lt;b&gt;bold text&lt;/b&gt;</code>.
     */
    public String sourceToOriginal() {
        StringBuilder buf = new StringBuilder();
        for (int i = 0; i < size(); i++) {
            buf.append(get(i).toOriginal());
        }
        return buf.toString();
    }

    // //////////////////////////////////////////////////////////////////////////
    // Dealing with translation
    // //////////////////////////////////////////////////////////////////////////

    /**
     * Entry holding the translated region, or <code>null</code> when no
     * translation differing from the source shortcut has been set.
     */
    Entry translatedEntry = null;

    /**
     * Sets the translation of the shortcut string returned by
     * {@link #sourceToShortcut(boolean, XMLDialect, List)}. Nothing is stored
     * when the translation equals the source shortcut. The intended tag
     * checks, which are not implemented yet, are that the translation
     * contains all the same tags in weakly correct order:
     * <ul>
     * <li>All the tags present in source must be present in translation. For
     * example, <code>It's &lt;b&gt;bold&lt;/b&gt; text</code> should
     * <b>not</b> be translated as <code>Etot tekst poluzhirnyi</code>.
     * <li>End tag goes after corresponding beginning tag. For example,
     * <code>It's &lt;b&gt;bold&lt;/b&gt; text</code> should <b>not</b> be
     * translated as <code>Etot tekst &lt;/b&gt;poluzhirnyi&lt;b&gt;</code>.
     * <li>If standalone tag or tag pair was enclosed in another tag pair in
     * source, it should be enclosed in translation. For example,
     * <code>It's &lt;b&gt;bold and &lt;i&gt;bold italic&lt;/i&gt;&lt;/b&gt; text</code>
     * should <b>not</b> be translated as
     * <code>Etot tekst &lt;b&gt;poluzhirnyi&lt;/b&gt; i &lt;i&gt;naklonnyi&lt;/i&gt;</code>.
     * <li>Independent standalone tags and tag pairs may be reordered within
     * entry. For example,
     * <code>It's &lt;b&gt;bold&lt;/b&gt; and &lt;i&gt;italic&lt;/i&gt; text</code>
     * <b>can</b> be translated as
     * <code>Etot tekst &lt;i&gt;naklonnyi&lt;/i&gt; i &lt;b&gt;poluzhirnyi&lt;/b&gt;</code>.
     * </ul>
     *
     * @param translation
     *            translated shortcut string
     * @param xmlDialect
     *            dialect used to build the source shortcut for comparison
     * @param protectedParts
     *            protected parts used when building the source shortcut and
     *            when locating tags in the translation
     * @throws TranslationException
     *             -- if any tag is missing or tags are ordered incorrectly.
     */
    public void setTranslation(String translation, XMLDialect xmlDialect, List<ProtectedPart> protectedParts)
            throws TranslationException {
        if (!sourceToShortcut(xmlDialect, protectedParts).equals(translation)) {
            checkAndRecoverTags(translation, protectedParts);
        }
    }

    /**
     * Rebuilds {@link #translatedEntry} from the translated shortcut string:
     * shortcut tags that match a tag of the translatable region are replaced
     * by that source tag, everything else becomes text. See
     * {@link #setTranslation(String, XMLDialect, List)} for the tag checks
     * that are meant to happen here; they are not implemented yet.
     */
    private void checkAndRecoverTags(String translation, List<ProtectedPart> protectedParts)
            throws TranslationException {
        translatedEntry = new Entry(xmlDialect, handler);

        // /////////////////////////////////////////////////////////////////////
        // recovering tags
        List<TagUtil.Tag> shortTags = TagUtil.buildTagList(translation,
                protectedParts.toArray(new ProtectedPart[protectedParts.size()]));
        int pos = 0;
        for (TagUtil.Tag shortTag : shortTags) {
            if (pos < shortTag.pos) {
                translatedEntry.add(createTextInstance(translation.substring(pos, shortTag.pos)));
                pos = shortTag.pos;
            }
            for (int j = getFirstGood(); j <= getLastGood(); j++) {
                Element longElem = get(j);
                if (longElem instanceof Tag) {
                    Tag longTag = (Tag) longElem;
                    if (longTag.toShortcut().equals(shortTag.tag)) {
                        translatedEntry.add(longTag);
                        pos += shortTag.tag.length();
                        break;
                    }
                }
            }
            // P.S. If shortcut tag isn't found, probably we should issue a
            // warning. As "pos" is not advanced in that case, the tag ends up
            // in the following text chunk.
        }
        if (pos < translation.length()) {
            translatedEntry.add(createTextInstance(translation.substring(pos)));
        }

        // /////////////////////////////////////////////////////////////////////
        // checking tags
        // TODO: implement checking
    }

    /**
     * Creates a text element for the translation, using the text instance
     * found in this entry as a factory when there is one and a plain
     * {@link XMLText} otherwise.
     */
    private Text createTextInstance(String str) {
        Text text = getTextInstance();
        if (text != null) {
            return text.createInstance(str);
        } else {
            return new XMLText(str, false);
        }
    }

    /**
     * Returns long XML-encoded representation of the entry translation for
     * storing in TMX. Falls back to {@link #sourceToTMX()} when no translation
     * is stored.
     */
    public String translationToTMX() {
        if (translatedEntry == null) {
            return sourceToTMX();
        }

        StringBuilder buf = new StringBuilder();

        for (int i = 0; i < getFirstGood(); i++) {
            buf.append(get(i).toTMX());
        }

        buf.append(translatedEntry.sourceToTMX());

        // lastGood is -2 when the entry has no translatable region; clamp the
        // start so that get() is never called with a negative index.
        for (int i = Math.max(getLastGood() + 1, 0); i < size(); i++) {
            buf.append(get(i).toTMX());
        }

        return buf.toString();
    }

    /**
     * Returns the translated entry as it should be stored in translated
     * document. Falls back to {@link #sourceToOriginal()} when no translation
     * is stored.
     */
    public String translationToOriginal() {
        if (translatedEntry == null) {
            return sourceToOriginal();
        }

        StringBuilder buf = new StringBuilder();

        for (int i = 0; i < getFirstGood(); i++) {
            buf.append(get(i).toOriginal());
        }

        buf.append(translatedEntry.sourceToOriginal());

        // lastGood is -2 when the entry has no translatable region; clamp the
        // start so that get() is never called with a negative index.
        for (int i = Math.max(getLastGood() + 1, 0); i < size(); i++) {
            buf.append(get(i).toOriginal());
        }

        return buf.toString();
    }

    // /////////////////////////////////////////////////////////////////////////
    // List of EntryElement objects.
    // /////////////////////////////////////////////////////////////////////////

    /** Elements (tags and text) of this entry. */
    private final List<Element> elements = new ArrayList<Element>();

    /**
     * Adds an element to this entry. Can be either a {@link Text} or a
     * {@link Tag}.
     */
    public void add(Element elem) {
        elements.add(elem);
        // each addition of a new element resets detected tags
        tagsDetected = false;
    }

    /** Removes an element from this entry. */
    public void remove(int index) {
        elements.remove(index);
        // each deletion of an element resets detected tags
        tagsDetected = false;
    }

    /** Gets an element. Can be either a {@link Text} or a {@link Tag}. */
    public Element get(int i) {
        return elements.get(i);
    }

    /** Returns the number of source elements. */
    public int size() {
        return elements.size();
    }

    /** Returns whether or not the elements list is empty. */
    public boolean isEmpty() {
        return elements.isEmpty();
    }
}