XLIFFDialect.java example

Explorer
OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
               2007-2010 Didier Briel
               2013 Alex Buloichik, Didier Briel, Piotr Kulik
               2014 Didier Briel, Aaron Madlon-Kay, Piotr Kulik
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.filters3.xml.xliff;

import java.util.List;

import org.omegat.core.data.ProtectedPart;
import org.omegat.core.statistics.StatisticsSettings;
import org.omegat.filters3.Attributes;
import org.omegat.filters3.Element;
import org.omegat.filters3.Tag;
import org.omegat.filters3.xml.DefaultXMLDialect;
import org.omegat.filters3.xml.XMLContentBasedTag;
import org.omegat.filters3.xml.XMLText;
import org.omegat.filters3.xml.xliff.XLIFFOptions.ID_TYPE;
import org.omegat.util.InlineTagHandler;
import org.omegat.util.StaticUtils;
import org.omegat.util.StringUtil;

/**
 * This class specifies XLIFF XML Dialect.
 *
 * XLIFF 1.2 specification:
 * http://docs.oasis-open.org/xliff/xliff-core/xliff-core.html
 *
 * @author Didier Briel
 * @author Alex Buloichik (alex73mail@gmail.com)
 * @author Piotr Kulik
 * @author Aaron Madlon-Kay
 */
public class XLIFFDialect extends DefaultXMLDialect {
    /** Letter used in a shortcut when no letter could be derived for the tag. */
    private static final char DEFAULT_SHORTCUT_LETTER = 'f';

    /** Option: force the shortcut letter of closing {@code <it>} tags to {@code f}. */
    private boolean forceShortCutToF;
    /** Option: do not derive the shortcut letter of {@code <ph>} tags from their type attributes. */
    private boolean ignoreTypeForPhTags;
    /** Option: do not derive the shortcut letter of {@code <bpt>} tags from their type attributes. */
    private boolean ignoreTypeForBptTags;
    /**
     * Sets whether alternative translations are identified by previous and
     * next paragraphs or by {@code <trans-unit>} ID.
     */
    protected ID_TYPE altTransIDType;

    /**
     * Creates an empty dialect. The actual definition is done later by
     * {@link #defineDialect(XLIFFOptions)}.
     */
    public XLIFFDialect() {
    }

    /**
     * Actually defines the dialect. It cannot be done during creation, because
     * options are not known at that step.
     *
     * @param options
     *            The XLIFF filter options driving the tag handling
     */
    public void defineDialect(XLIFFOptions options) {

        defineParagraphTags(new String[] { "source", "target", });

        defineOutOfTurnTags(new String[] { "sub", });

        if (options.get26Compatibility()) { // Old tag handling compatible with 2.6
            // Inline tags (ph, bpt, ept, it) are kept intact instead of being content-based.
            defineIntactTags(new String[] { "source", "header", "bin-unit", "prop-group", "count-group",
                    "alt-trans", "note",
                    "ph", "bpt", "ept", "it", "context", "seg-source", "sdl:seg-defs"});

        } else { // New tag handling
            defineIntactTags(new String[] { "source", "header", "bin-unit", "prop-group", "count-group",
                    "alt-trans", "note",
                    "context", "seg-source", "sdl:seg-defs"});

            defineContentBasedTag("bpt", Tag.Type.BEGIN);
            defineContentBasedTag("ept", Tag.Type.END);
            defineContentBasedTag("it", Tag.Type.ALONE);
            defineContentBasedTag("ph", Tag.Type.ALONE);
            // "mrk", only <mrk mtype="protected"> is content-based tag. see validateContentBasedTag

            // NOTE(review): the option-derived fields below are assigned only in this branch;
            // in 2.6-compatibility mode this method leaves them untouched. Confirm that this
            // is intended for altTransIDType as well.
            forceShortCutToF = options.getForceShortcutToF();
            ignoreTypeForPhTags = options.getIgnoreTypeForPhTags();
            ignoreTypeForBptTags = options.getIgnoreTypeForBptTags();
            altTransIDType = options.getAltTransIDType();
        }

    }

    /**
     * In the XLIFF filter, the tag &lt;mrk&gt; is a preformat tag when the
     * attribute "mtype" is "seg" (compared ignoring case).
     *
     * @param tag
     *            An XML tag
     * @param atts
     *            The attributes associated with the tag
     * @return <code>true</code> if this tag should be a preformat tag,
     *         <code>false</code> otherwise (including when the tag is
     *         <code>null</code>)
     */
    @Override
    public Boolean validatePreformatTag(String tag, Attributes atts) {
        // Literal-first comparison keeps the check null-safe, like validateContentBasedTag.
        if (!"mrk".equalsIgnoreCase(tag)) {
            return false;
        }
        return atts != null && "seg".equalsIgnoreCase(atts.getValueByName("mtype"));
    }

    /**
     * In the XLIFF filter, content shouldn't be translated if translate="no"
     * http://docs.oasis-open.org/xliff/v1.2/os/xliff-core.html#translate
     *
     * @param tag
     *            An XML tag
     * @param atts
     *            The attributes associated with the tag
     * @return <code>false</code> if the content of this tag should be
     *         translated, <code>true</code> otherwise
     */
    @Override
    public Boolean validateIntactTag(String tag, Attributes atts) {
        // Translate can only appear in these tags
        boolean mayCarryTranslate = "group".equalsIgnoreCase(tag)
                || "trans-unit".equalsIgnoreCase(tag)
                || "bin-unit".equalsIgnoreCase(tag);
        if (!mayCarryTranslate) {
            return false;
        }
        return atts != null && "no".equalsIgnoreCase(atts.getValueByName("translate"));
    }

    /**
     * Only {@code <mrk mtype="protected">} is validated as a content-based
     * tag here. Both the tag name and the attribute value are compared
     * case-sensitively.
     *
     * @param tag
     *            An XML tag
     * @param atts
     *            The attributes associated with the tag
     * @return <code>true</code> if this tag should be a content-based tag,
     *         <code>false</code> otherwise
     */
    @Override
    public Boolean validateContentBasedTag(String tag, Attributes atts) {
        return "mrk".equals(tag) && atts != null && "protected".equals(atts.getValueByName("mtype"));
    }

    /**
     * Builds the shortcut representation of a list of elements and records
     * one {@link ProtectedPart} per tag.
     *
     * @param elements
     *            The elements to convert, in document order
     * @param protectedParts
     *            Output list; it is cleared first, then receives one entry
     *            for every content-based tag and every other tag
     * @return the concatenated shortcut text of all elements
     */
    @Override
    public String constructShortcuts(List<Element> elements, List<ProtectedPart> protectedParts) {
        protectedParts.clear();
        // One handler per call: it tracks tag numbering and pairing across the elements.
        InlineTagHandler tagHandler = new InlineTagHandler();

        StringBuilder result = new StringBuilder();
        for (Element el : elements) {
            if (el instanceof XMLContentBasedTag) {
                XMLContentBasedTag tag = (XMLContentBasedTag) el;
                String shortcut = assignContentBasedShortcut(tag, tagHandler);
                result.append(shortcut);
                protectedParts.add(describeContentBasedTag(tag, shortcut));
            } else if (el instanceof Tag) {
                Tag tag = (Tag) el;
                int tagIndex = tagHandler.paired(tag.getTag(), tag.getType());
                tag.setIndex(tagIndex);
                String shortcut = tag.toShortcut();
                result.append(shortcut);
                protectedParts.add(describePairedTag(tag, shortcut));
            } else {
                result.append(el.toShortcut());
            }
        }
        return result.toString();
    }

    /**
     * Computes shortcut letter, index and text for a content-based tag,
     * stores all three on the tag and returns the shortcut text.
     *
     * @param tag
     *            The content-based tag (bpt, ept, it, ph or mrk)
     * @param tagHandler
     *            The handler shared by all tags of the current segment
     * @return the shortcut text, or <code>null</code> for an unrecognized
     *         tag name
     */
    private String assignContentBasedShortcut(XMLContentBasedTag tag, InlineTagHandler tagHandler) {
        String shortcut = null;
        int shortcutLetter;
        int tagIndex;
        if ("bpt".equals(tag.getTag())) {
            // XLIFF specification requires 'rid' and 'id' attributes,
            // but some tools uses 'i' attribute like for TMX
            tagHandler.startBPT(tag.getAttribute("rid"), tag.getAttribute("id"), tag.getAttribute("i"));
            shortcutLetter = calcTagShortcutLetter(tag, ignoreTypeForBptTags);
            // Stored in the handler so that the matching <ept> can read it back.
            tagHandler.setTagShortcutLetter(shortcutLetter);
            tagIndex = tagHandler.endBPT();
            shortcut = "<" + shortcutLetterText(shortcutLetter) + tagIndex + '>';
        } else if ("ept".equals(tag.getTag())) {
            tagHandler.startEPT(tag.getAttribute("rid"), tag.getAttribute("id"), tag.getAttribute("i"));
            tagIndex = tagHandler.endEPT();
            shortcutLetter = tagHandler.getTagShortcutLetter();
            shortcut = "</" + shortcutLetterText(shortcutLetter) + tagIndex + '>';
        } else if ("it".equals(tag.getTag())) {
            tagHandler.startOTHER();
            tagHandler.setCurrentPos(tag.getAttribute("pos"));
            tagIndex = tagHandler.endOTHER();
            shortcutLetter = calcTagShortcutLetter(tag);
            // XLIFF specification requires 'open/close' values,
            // but some tools may use 'begin/end' values like for TMX
            boolean closing = "close".equals(tagHandler.getCurrentPos())
                    || "end".equals(tagHandler.getCurrentPos());
            if (closing) {
                // In some cases, even if we're able to compute a shortcut, it's better to force to "f"
                // for better compatibility with corresponding TMX files
                if (forceShortCutToF) {
                    shortcutLetter = DEFAULT_SHORTCUT_LETTER;
                }
                shortcut = "</" + shortcutLetterText(shortcutLetter) + tagIndex + '>';
            } else {
                shortcut = "<" + shortcutLetterText(shortcutLetter) + tagIndex + '>';
            }
        } else if ("ph".equals(tag.getTag())) {
            tagHandler.startOTHER();
            tagIndex = tagHandler.endOTHER();
            shortcutLetter = calcTagShortcutLetter(tag, ignoreTypeForPhTags);
            shortcut = "<" + shortcutLetterText(shortcutLetter) + tagIndex + "/>";
        } else if ("mrk".equals(tag.getTag())) {
            tagHandler.startOTHER();
            tagIndex = tagHandler.endOTHER();
            shortcutLetter = 'm';
            // The protected text is kept inside the shortcut, like <m0>Acme</m0>
            shortcut = "<m" + tagIndex + ">" + tag.getIntactContents().sourceToOriginal() + "</m" + tagIndex
                    + ">";
        } else {
            // NOTE(review): no shortcut text is produced for an unrecognized tag name, so the
            // caller appends the literal text "null" to the segment. Only bpt, ept, it, ph and
            // mrk are declared content-based in this class, so this branch looks unreachable
            // from here - confirm against subclasses.
            shortcutLetter = DEFAULT_SHORTCUT_LETTER;
            tagIndex = -1;
        }
        tag.setShortcutLetter(shortcutLetter);
        tag.setShortcutIndex(tagIndex);
        tag.setShortcut(shortcut);
        return shortcut;
    }

    /**
     * Creates the protected part describing a content-based tag.
     *
     * @param tag
     *            The content-based tag
     * @param shortcut
     *            The shortcut text assigned to the tag
     * @return the populated protected part
     */
    private static ProtectedPart describeContentBasedTag(XMLContentBasedTag tag, String shortcut) {
        ProtectedPart part = new ProtectedPart();
        part.setTextInSourceSegment(shortcut);
        part.setDetailsFromSourceFile(tag.toOriginal());
        // Only <mrk> carries protected text; every other content-based tag is a simple tag.
        if ("mrk".equals(tag.getTag())) {
            // protected text with related tags, like <m0>Acme</m0>
            if (StatisticsSettings.isCountingProtectedText()) {
                // Protected texts are counted, but related tags are not counted in the word count
                part.setReplacementWordsCountCalculation(StaticUtils.TAG_REPLACEMENT
                        + tag.getIntactContents().sourceToOriginal() + StaticUtils.TAG_REPLACEMENT);
            } else {
                // All protected parts are not counted in the word count(default)
                part.setReplacementWordsCountCalculation(StaticUtils.TAG_REPLACEMENT);
            }
            part.setReplacementUniquenessCalculation(StaticUtils.TAG_REPLACEMENT);
            part.setReplacementMatchCalculation(tag.getIntactContents().sourceToOriginal());
        } else {
            // simple tag, like <i0>
            applyStandardTagReplacements(part, StatisticsSettings.isCountingStandardTags()
                    ? tag.toSafeCalcShortcut() : StaticUtils.TAG_REPLACEMENT);
        }
        return part;
    }

    /**
     * Creates the protected part describing a tag that is not content-based.
     *
     * @param tag
     *            The tag, with its index already set
     * @param shortcut
     *            The shortcut text of the tag
     * @return the populated protected part
     */
    private static ProtectedPart describePairedTag(Tag tag, String shortcut) {
        ProtectedPart part = new ProtectedPart();
        part.setTextInSourceSegment(shortcut);
        part.setDetailsFromSourceFile(tag.toOriginal());
        applyStandardTagReplacements(part, StatisticsSettings.isCountingStandardTags()
                ? tag.toSafeCalcShortcut() : StaticUtils.TAG_REPLACEMENT);
        return part;
    }

    /**
     * Sets the three replacement strings of a simple (non-protected) tag:
     * the given text for the word count, and the tag replacement for the
     * uniqueness and match calculations.
     *
     * @param part
     *            The protected part to fill
     * @param wordCountText
     *            The text standing in for the tag in the word count
     */
    private static void applyStandardTagReplacements(ProtectedPart part, String wordCountText) {
        part.setReplacementWordsCountCalculation(wordCountText);
        part.setReplacementUniquenessCalculation(StaticUtils.TAG_REPLACEMENT);
        part.setReplacementMatchCalculation(StaticUtils.TAG_REPLACEMENT);
    }

    /**
     * Converts a shortcut letter code point to the text used inside a
     * shortcut.
     *
     * @param letter
     *            The code point of the letter, or 0 when none was derived
     * @return the letter as text, or "f" when <code>letter</code> is 0
     */
    private static String shortcutLetterText(int letter) {
        if (letter == 0) {
            return String.valueOf(DEFAULT_SHORTCUT_LETTER);
        }
        return String.valueOf(Character.toChars(letter));
    }

    /**
     * Derives the shortcut letter of a tag, taking its type attributes into
     * account.
     *
     * @param tag
     *            The content-based tag
     * @return the letter, or 0 when none could be derived
     */
    private int calcTagShortcutLetter(XMLContentBasedTag tag) {
        return calcTagShortcutLetter(tag, false);
    }

    /**
     * Derives the shortcut letter of a tag. When the first element of the
     * tag's intact contents is text, the letter comes from that text;
     * otherwise it comes from the "ctype" or "type" attribute (passed to
     * {@code StringUtil.nvl} in that order), unless the type is to be
     * ignored.
     *
     * @param tag
     *            The content-based tag
     * @param ignoreType
     *            <code>true</code> to never derive the letter from the type
     *            attributes
     * @return the letter, or 0 when none could be derived
     */
    private int calcTagShortcutLetter(XMLContentBasedTag tag, boolean ignoreType) {
        if (!tag.getIntactContents().isEmpty() && (tag.getIntactContents().get(0) instanceof XMLText)) {
            XMLText firstText = (XMLText) tag.getIntactContents().get(0);
            return StringUtil.getFirstLetterLowercase(firstText.getText());
        }
        String type = StringUtil.nvl(tag.getAttribute("ctype"), tag.getAttribute("type"));
        if (type == null || ignoreType) {
            return 0;
        }
        return StringUtil.getFirstLetterLowercase(type);
    }
}