ResourceBundleFilter.java example

Explorer
OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
               2009 Alex Buloichik
               2011 Martin Fleurke
               2013-2014 Enrique Estevez, Didier Briel
               2015 Aaron Madlon-Kay, Enrique Estevez
               2016 Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.filters2.text.bundles;

import java.awt.Window;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.omegat.core.data.ProtectedPart;
import org.omegat.filters2.AbstractFilter;
import org.omegat.filters2.FilterContext;
import org.omegat.filters2.Instance;
import org.omegat.filters2.TranslationException;
import org.omegat.util.LinebreakPreservingReader;
import org.omegat.util.Log;
import org.omegat.util.NullBufferedWriter;
import org.omegat.util.OStrings;
import org.omegat.util.PatternConsts;
import org.omegat.util.StringUtil;
import org.omegat.util.TagUtil;

/**
 * Filter to support Java Resource Bundles - the files that are used to I18ze
 * Java applications.
 *
 * @author Maxym Mykhalchuk
 * @author Keith Godfrey
 * @author Alex Buloichik (alex73mail@gmail.com)
 * @author Martin Fleurke
 * @author Enrique Estevez (keko.gl@gmail.com)
 * @author Didier Briel
 * @author Aaron Madlon-Kay
 *
 * Option to remove untranslated segments in the target files
 * (code adapted from the file MozillaDTDFilter.java).
 * Support for encodings other than ASCII. The choice is left to the user,
 * who has to select the encoding of both the source and the target file.
 * The default is ASCII, which corresponds to the standard behaviour: in that case, any character above 127 is encoded
 * according to the specifications of the bundle files. If another character set is chosen, no encoding takes place
 * and it's up to the user to select a charset compatible with the characters used.
 * "auto" for the target encoding is considered as being ASCII.
 *
 * Support for showing comments in the Comments panel (localization notes).
 * Optionally can leave Unicode literals (\\uXXXX) unescaped.
 */
public class ResourceBundleFilter extends AbstractFilter {

    /**
     * Marker looked for in comment lines. The key=value pair that follows a
     * comment containing this string is not offered for translation; it is
     * written to the target file with only the usual escaping applied.
     * <p>
     * TODO: Make this optional
     */
    public static final String DO_NOT_TRANSLATE_COMMENT = "NOI18N";

    // Names of the filter options read from processOptions when a file is
    // processed; an option is enabled when its value is "true" (any case).
    public static final String OPTION_REMOVE_STRINGS_UNTRANSLATED = "unremoveStringsUntranslated";
    public static final String OPTION_DONT_UNESCAPE_U_LITERALS = "dontUnescapeULiterals";
    // Encoding used for the target file when none has been chosen.
    public static final String DEFAULT_TARGET_ENCODING = StandardCharsets.US_ASCII.name();

    // Receives key -> value pairs while a file is processed with an align
    // callback set; alignFile() points it first at the source map and then
    // at the translated map.
    protected Map<String, String> align;

    // Name of the charset of the target file. Updated by createWriter() and
    // consulted by toAscii() to decide which characters need a \\uXXXX escape.
    private String targetEncoding = DEFAULT_TARGET_ENCODING;

    /**
     * If true, will remove non-translated segments in the target files
     */
    private boolean removeStringsUntranslated = false;

    /**
     * If true, \\uXXXX literals found in the source are left as they are
     * instead of being converted into the characters they stand for, and their
     * backslash is preserved when the target is written.
     */
    private boolean dontUnescapeULiterals = false;

    /**
     * Returns the display name of this file format, obtained from
     * {@code OStrings} under the key {@code RBFILTER_FILTER_NAME}.
     */
    @Override
    public String getFileFormatName() {
        final String formatName = OStrings.getString("RBFILTER_FILTER_NAME");
        return formatName;
    }

    /**
     * Tells whether the encoding of the source file can be chosen.
     *
     * @return always {@code true}: the source encoding of a resource bundle
     *         is selectable
     */
    @Override
    public boolean isSourceEncodingVariable() {
        return true;
    }

    /**
     * Tells whether the encoding of the target file can be chosen.
     *
     * @return always {@code true}: the target encoding of a resource bundle
     *         is selectable
     */
    @Override
    public boolean isTargetEncodingVariable() {
        return true;
    }

    /**
     * Provides the single default instance of this filter: files matching
     * {@code *.properties}, US-ASCII for both source and target encoding, and
     * a target file name built from the name-only, target-locale and extension
     * placeholders.
     */
    @Override
    public Instance[] getDefaultInstances() {
        String ascii = StandardCharsets.US_ASCII.name();
        String targetNamePattern = TFP_NAMEONLY + "_" + TFP_TARGET_LOCALE + "." + TFP_EXTENSION;
        Instance propertiesInstance = new Instance("*.properties", ascii, ascii, targetNamePattern);
        return new Instance[] { propertiesInstance };
    }

    /**
     * Creates the writer used to save a localized resource bundle and records
     * the encoding it writes in, so that later escaping decisions agree with
     * the file actually produced.
     * <p>
     * NOTE: the name of localized resource bundle is different from the name of
     * original one. e.g. "Bundle.properties" -> Russian =
     * "Bundle_ru.properties"
     *
     * @param outfile
     *            file to write the localized bundle to
     * @param encoding
     *            charset name of the target file, or {@code null} to use
     *            {@link #DEFAULT_TARGET_ENCODING}
     * @return a buffered writer encoding into the chosen charset
     * @throws UnsupportedEncodingException
     *             if the charset name is not supported
     * @throws IOException
     *             if the file cannot be opened for writing
     */
    @Override
    public BufferedWriter createWriter(File outfile, String encoding) throws UnsupportedEncodingException,
            IOException {
        // Fall back to the default instead of silently reusing whatever
        // encoding a previously written file happened to select.
        targetEncoding = encoding != null ? encoding : DEFAULT_TARGET_ENCODING;

        FileOutputStream stream = new FileOutputStream(outfile);
        try {
            return new BufferedWriter(new OutputStreamWriter(stream, targetEncoding));
        } catch (UnsupportedEncodingException | RuntimeException ex) {
            // The stream is already open at this point: release it before
            // reporting the failure, otherwise the file handle leaks.
            try {
                stream.close();
            } catch (IOException closeFailure) {
                ex.addSuppressed(closeFailure);
            }
            throw ex;
        }
    }

    /**
     * Picks the encoding of the target file: the one requested in the filter
     * context, or {@link #DEFAULT_TARGET_ENCODING} when none was requested.
     */
    @Override
    protected String getOutputEncoding(FilterContext fc) {
        String requested = fc.getOutEncoding();
        if (requested != null) {
            return requested;
        }
        // Nothing specified by the user ("<auto>"): use the default
        return DEFAULT_TARGET_ENCODING;
    }

    /**
     * Processes an input line for use in OmegaT by doing the following:
     * <ul>
     * <li>Drops the whitespace at the beginning of the line.
     * <li>Converts ASCII-encoded \\uXXXX chars to normal characters, unless the
     * filter has been told to leave such literals alone.
     * <li>Converts \r, \n and \t to CR, line feed and tab.
     * <li>But! Keeps the backslash in '\ ', '\=', '\:', etc. (non-trimmable
     * space or non-key-value-breaking equals), and keeps a lone backslash at
     * the very end of the line.
     * </ul>
     *
     * @param line
     *            one line of the bundle, without its line terminator
     * @return the normalized line
     * @throws IOException
     *             not thrown by this implementation; part of the signature
     * @throws TranslationException
     *             if \\u is not followed by exactly four hexadecimal digits
     */
    protected String normalizeInputLine(String line) throws IOException, TranslationException {
        StringBuilder result = new StringBuilder(line.length());

        // Whitespace at the beginning of lines is ignored
        boolean strippingWhitespace = true;
        for (int cp, len = line.length(), i = 0; i < len; i += Character.charCount(cp)) {
            cp = line.codePointAt(i);
            if (strippingWhitespace) {
                if (Character.isWhitespace(cp)) {
                    continue;
                }
                strippingWhitespace = false;
            }
            if (cp == '\\' && line.codePointCount(i, len) > 1) {
                // Step onto the character that is being escaped
                i += Character.charCount(cp);
                cp = line.codePointAt(i);
                if (cp == 'n') {
                    cp = '\n';
                } else if (cp == 'r') {
                    cp = '\r';
                } else if (cp == 't') {
                    cp = '\t';
                } else if (cp != 'u' || dontUnescapeULiterals) {
                    // Either some other escape, or a \\uXXXX literal we were
                    // asked to keep: put back the \ we swallowed
                    result.append('\\');
                } else {
                    // checking if the string is long enough for 'u' + 4 digits
                    if (len - i < 1 + 4) {
                        throw new TranslationException(OStrings.getString("RBFH_ERROR_ILLEGAL_U_SEQUENCE"));
                    }
                    int unit = 0;
                    for (int k = i + 1; k <= i + 4; k++) {
                        char ch = line.charAt(k);
                        // Only ASCII hexadecimal digits are legal here. A
                        // general-purpose integer parser would also accept a
                        // sign or non-ASCII digits, which are malformed.
                        int digit = ch < 128 ? Character.digit(ch, 16) : -1;
                        if (digit < 0) {
                            throw new TranslationException(OStrings.getString("RBFH_ERROR_ILLEGAL_U_SEQUENCE"));
                        }
                        unit = (unit << 4) | digit;
                    }
                    cp = unit;
                    // Move onto the last digit; the loop increment (one char,
                    // as four hex digits never exceed 0xFFFF) steps past it.
                    i += 4;
                }
            }
            result.appendCodePoint(cp);
        }

        return result.toString();
    }

    /**
     * Kind of text handed to {@link #toAscii(String, EscapeMode)}, which
     * selects the escaping rules applied to it.
     */
    private enum EscapeMode {
        // KEY: backslashes are escaped, and so are ' ', '=' and ':'.
        // VALUE: backslashes are escaped, separators are left alone.
        // COMMENT: backslashes are left untouched.
        KEY, VALUE, COMMENT
    }

    /**
     * Escapes a string so that it can be written to the target bundle in the
     * current target encoding.
     * <p>
     * Line feed, carriage return and tab always become \n, \r and \t. Printable
     * ASCII characters and characters the target charset can encode are
     * written as they are; everything else becomes one \\uXXXX sequence per
     * UTF-16 unit.
     *
     * @param text
     *            Text to convert.
     * @param mode
     *            What the text is. For a key, ' ', ':' and '=' are escaped too
     *            (they MUST be escaped in a key and MAY be escaped in a value,
     *            but we don't escape these in values). For a key or a value a
     *            backslash is doubled, except that the backslash of a \\uXXXX
     *            literal stays single when such literals are not unescaped.
     *            In a comment backslashes are left alone.
     * @return the escaped text
     */
    private String toAscii(String text, EscapeMode mode) {
        CharsetEncoder encoder = Charset.forName(targetEncoding).newEncoder();
        final boolean isKey = mode == EscapeMode.KEY;
        final boolean isComment = mode == EscapeMode.COMMENT;

        StringBuilder out = new StringBuilder();
        final int end = text.length();
        int pos = 0;
        while (pos < end) {
            final int cp = text.codePointAt(pos);
            final int width = Character.charCount(cp);
            switch (cp) {
            case '\\':
                if (isComment) {
                    out.append('\\');
                } else if (dontUnescapeULiterals && containsUEscapeAt(text, pos)) {
                    // Backslash of a literal that was never unescaped
                    out.append('\\');
                } else {
                    out.append("\\\\");
                }
                break;
            case '\n':
                out.append("\\n");
                break;
            case '\r':
                out.append("\\r");
                break;
            case '\t':
                out.append("\\t");
                break;
            case ' ':
            case '=':
            case ':':
                if (isKey) {
                    out.append('\\');
                }
                out.appendCodePoint(cp);
                break;
            default:
                boolean printableAscii = cp >= 32 && cp < 127;
                if (printableAscii || encoder.canEncode(text.subSequence(pos, pos + width))) {
                    out.appendCodePoint(cp);
                } else {
                    for (char unit : Character.toChars(cp)) {
                        String hex = Integer.toHexString(unit);
                        out.append("\\u");
                        for (int pad = hex.length(); pad < 4; pad++) {
                            out.append('0');
                        }
                        out.append(hex);
                    }
                }
                break;
            }
            pos += width;
        }
        return out.toString();
    }

    /**
     * Tells whether a \\uXXXX literal starts at the given position, i.e.
     * whether the character after {@code offset} is 'u' and is followed by
     * exactly four ASCII hexadecimal digits. The character at {@code offset}
     * itself is not inspected; callers only ask about positions holding a
     * backslash.
     *
     * @param text
     *            text to look into
     * @param offset
     *            index of the backslash that would start the literal
     * @return {@code true} if a well-formed literal is found, {@code false}
     *         otherwise, including when the text is too short
     */
    private static boolean containsUEscapeAt(String text, int offset) {
        // Backslash, 'u' and the four digits all take one char each
        int end = offset + 1 + 1 + 4;
        if (offset < 0 || end > text.length()) {
            return false;
        }
        if (text.charAt(offset + 1) != 'u') {
            return false;
        }
        for (int i = offset + 2; i < end; i++) {
            char ch = text.charAt(i);
            // Only 0-9, a-f and A-F count. A general-purpose integer parser
            // would also accept a leading sign or non-ASCII digits.
            if (ch >= 128 || Character.digit(ch, 16) < 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Strips the escaping backslashes found in, e.g. "\ ", "\=" and "\:",
     * typical in machine-generated resource bundles, keeping the character
     * that follows each of them. A backslash at the very end of the string
     * means a mandatory space has been trimmed, so it is turned back into a
     * space. The backslash of a \\uXXXX literal is kept when such literals are
     * not being unescaped.
     *
     * @see <a href="https://sourceforge.net/p/omegat/bugs/266/">bug #266</a>
     */
    private String removeExtraSlashes(String string) {
        final int total = string.length();
        StringBuilder cleaned = new StringBuilder(total);
        int pos = 0;
        while (pos < total) {
            int current = string.codePointAt(pos);
            if (current == '\\') {
                // Don't remove \ before \\uXXXX if we are not unescaping
                boolean keptLiteral = dontUnescapeULiterals && containsUEscapeAt(string, pos);
                if (!keptLiteral) {
                    if (string.codePointCount(pos, total) > 1) {
                        // Fix for [ 1812183 ] Properties: space before "="
                        // shouldn't be part of the key, contributed by Arno
                        // Peters
                        pos += Character.charCount(current);
                        current = string.codePointAt(pos);
                    } else {
                        current = ' ';
                    }
                }
            }
            cleaned.appendCodePoint(current);
            pos += Character.charCount(current);
        }
        return cleaned.toString();
    }

    /**
     * Reads a resource bundle line by line, passes every key=value pair to
     * {@link #process(String, String, String)} and writes the resulting bundle
     * to {@code outfile}.
     * <ul>
     * <li>Blank lines are copied unchanged and detach any pending comment.
     * <li>Comment lines ('#' or '!') are copied, and their text is collected
     * and passed along with the next translatable entry. A comment containing
     * {@link #DO_NOT_TRANSLATE_COMMENT} makes the next key=value pair be
     * copied without translation.
     * <li>A line ending with an odd number of backslashes is glued to the
     * following line; an even number is a run of escaped backslashes and does
     * not continue the line.
     * <li>Lines without a key-value separator are not written to the target.
     * <li>Untranslated entries are left out when the corresponding filter
     * option is enabled.
     * </ul>
     *
     * @param reader
     *            source bundle
     * @param outfile
     *            target bundle
     * @param fc
     *            filter context (not used by this method itself)
     * @throws IOException
     *             if reading or writing fails
     * @throws TranslationException
     *             if a key or value line contains a malformed \\uXXXX sequence
     */
    @Override
    public void processFile(BufferedReader reader, BufferedWriter outfile, FilterContext fc)
            throws IOException, TranslationException {
        // Parameter in the options of filter to customize the target file
        removeStringsUntranslated = processOptions != null
                && "true".equalsIgnoreCase(processOptions.get(OPTION_REMOVE_STRINGS_UNTRANSLATED));

        // Parameter in the options of filter to customize the behavior of the
        // filter
        dontUnescapeULiterals = processOptions != null
                && "true".equalsIgnoreCase(processOptions.get(OPTION_DONT_UNESCAPE_U_LITERALS));

        boolean noi18n = false;
        // Support to show the comments (localization notes) into the Comments
        // panel
        String comments = null;

        LinebreakPreservingReader lbpr = new LinebreakPreservingReader(reader); // fix for bug 1462566
        try {
            String raw;
            while ((raw = lbpr.readLine()) != null) {

                String trimmed = raw.trim();

                // skipping empty strings
                if (trimmed.isEmpty()) {
                    outfile.write(raw);
                    outfile.write(lbpr.getLinebreak());
                    // Delete the comments
                    comments = null;
                    continue;
                }

                // skipping comments
                int firstCp = trimmed.codePointAt(0);
                if (firstCp == '#' || firstCp == '!') {
                    outfile.write(toAscii(raw, EscapeMode.COMMENT));
                    outfile.write(lbpr.getLinebreak());
                    String commentText;
                    try {
                        commentText = normalizeInputLine(raw);
                    } catch (TranslationException ignored) {
                        // Escapes carry no meaning inside a comment, so text
                        // that merely looks like a broken unicode escape (a
                        // Windows path, for instance) must not abort the whole
                        // file: show the comment as written instead.
                        commentText = trimmed;
                    }
                    // Save the comments
                    comments = comments == null ? commentText : comments + "\n" + commentText;
                    // checking if the next string shouldn't be
                    // internationalized
                    if (raw.contains(DO_NOT_TRANSLATE_COMMENT)) {
                        noi18n = true;
                    }
                    continue;
                }

                // We are going to use the content of this line,
                // so trim and unescape
                String processed = normalizeInputLine(raw);

                // reading the glued lines
                while (endsWithLineContinuation(processed)) {
                    String next = lbpr.readLine();
                    if (next == null) {
                        next = "";
                    }
                    // gluing this line (w/o '\' on this line)
                    // with next line (w/o leading spaces)
                    processed = processed.substring(0, processed.length() - 1) + normalizeInputLine(next);
                }

                // key=value pairs
                int equalsPos = searchEquals(processed);
                if (equalsPos < 0) {
                    // NOTE(review): a line without any separator is dropped
                    // from the target, as it always has been - confirm that
                    // this is intended for keys with an empty value.
                    continue;
                }

                String key = removeExtraSlashes(processed.substring(0, equalsPos).trim());

                // advance if there're spaces or tabs after =
                int equalsEnd = processed.offsetByCodePoints(equalsPos, 1);
                while (equalsEnd < processed.length()) {
                    int cp = processed.codePointAt(equalsEnd);
                    if (cp != ' ' && cp != '\t') {
                        break;
                    }
                    equalsEnd += Character.charCount(cp);
                }
                // The separator is written back together with its padding
                String equals = processed.substring(equalsPos, equalsEnd);

                // value, if any
                String value = equalsEnd < processed.length()
                        ? removeExtraSlashes(processed.substring(equalsEnd))
                        : "";

                if (noi18n) {
                    // if we don't need to internationalize
                    outfile.write(toAscii(key, EscapeMode.KEY));
                    outfile.write(equals);
                    outfile.write(toAscii(value, EscapeMode.VALUE));
                    outfile.write(lbpr.getLinebreak());
                    noi18n = false;
                    // The comment belonged to this entry; don't let it leak
                    // onto the next translatable one
                    comments = null;
                    continue;
                }

                // Keep consecutive line feeds apart while the text is being
                // translated; undone below
                value = value.replaceAll("\\n\\n", "\n \n");
                // If there is a comment, show it into the Comments panel
                String trans = process(key, value, comments);
                // Delete the comments
                comments = null;
                // Check if the segment is not translated
                boolean translatedSegment = !"--untranslated_yet--".equals(trans);
                if (!translatedSegment) {
                    trans = value;
                }
                trans = trans.replaceAll("\\n\\s\\n", "\n\n");
                trans = toAscii(trans, EscapeMode.VALUE);
                if (!trans.isEmpty() && trans.codePointAt(0) == ' ') {
                    // A leading space would be skipped on reading: escape it
                    trans = '\\' + trans;
                }
                // Non-translated segments are written based on the
                // filter options
                if (translatedSegment || !removeStringsUntranslated) {
                    outfile.write(toAscii(key, EscapeMode.KEY));
                    outfile.write(equals);
                    outfile.write(trans);
                    outfile.write(lbpr.getLinebreak()); // fix for bug 1462566
                }
            }
        } finally {
            lbpr.close();
        }
    }

    /**
     * Tells whether a normalized line asks to be continued on the next one,
     * i.e. whether it ends with an odd number of backslashes. An even number
     * is a run of escaped backslashes; an empty line never continues.
     */
    private static boolean endsWithLineContinuation(String line) {
        int backslashes = 0;
        for (int i = line.length() - 1; i >= 0 && line.charAt(i) == '\\'; i--) {
            backslashes++;
        }
        return (backslashes & 1) == 1;
    }

    /**
     * Looks for the key-value separator (=,: or ' ') in the string.
     * <p>
     * A character preceded by an unescaped backslash is never a separator. A
     * backslash that is itself escaped does not escape what follows, so in
     * {@code a\\=b} the '=' is the separator. When the first separator found
     * is a space or a tab and the first character after the run of spaces and
     * tabs is '=' or ':', that character is the separator instead.
     *
     * @param str
     *            the line to search
     * @return The char number of key-value separator in a string. Note that if
     *         the string does not contain any separator this string is
     *         considered to be a key with empty string value, and this method
     *         returns <code>-1</code> to indicate there's no equals.
     * @see <a href="https://sourceforge.net/p/omegat/bugs/266/">bug #266</a>
     */
    private int searchEquals(String str) {
        // True while the current character is escaped by the backslash
        // right before it
        boolean escaped = false;
        for (int cp, i = 0; i < str.length(); i += Character.charCount(cp)) {
            cp = str.codePointAt(i);
            if (escaped) {
                escaped = false;
            } else if (cp == '\\') {
                escaped = true;
            } else if (cp == '=' || cp == ':') {
                return i;
            } else if (cp == ' ' || cp == '\t') {
                // Whitespace separates, unless it is only padding in front of
                // a real '=' or ':'
                for (int j = i + 1; j < str.length(); j++) {
                    char next = str.charAt(j);
                    if (next == ':' || next == '=') {
                        return j;
                    }
                    if (next != ' ' && next != '\t') {
                        break;
                    }
                }
                return i;
            }
        }
        return -1;
    }

    /**
     * Hands one key=value pair to whichever callback is currently set.
     * <ul>
     * <li>Parsing: the entry is registered together with its comment and its
     * protected parts, and the value is returned unchanged.
     * <li>Translating: the translation is returned, or the marker
     * "--untranslated_yet--" when there is none.
     * <li>Aligning: the pair is stored in {@link #align} and the value is
     * returned unchanged.
     * </ul>
     *
     * @param key
     *            key of the entry
     * @param value
     *            source text of the entry
     * @param c
     *            comment shown in the Comments panel, may be {@code null}
     * @return the text to write for this entry
     */
    protected String process(String key, String value, String c) {
        if (entryParseCallback != null) {
            List<ProtectedPart> parts = TagUtil.applyCustomProtectedParts(value,
                    PatternConsts.SIMPLE_JAVA_MESSAGEFORMAT_PATTERN_VARS, null);
            entryParseCallback.addEntry(key, value, null, false, c, null, this, parts);
        } else if (entryTranslateCallback != null) {
            String translation = entryTranslateCallback.getTranslation(key, value, null);
            if (translation == null) {
                return "--untranslated_yet--";
            }
            return translation;
        } else if (entryAlignCallback != null) {
            align.put(key, value);
        }
        return value;
    }

    /**
     * Aligns a source bundle with its translation by key: both files are
     * processed, each one filling its own key -> value map, and every source
     * entry whose key has a non-empty value in the translated file is reported
     * to the align callback.
     */
    @Override
    protected void alignFile(BufferedReader sourceFile, BufferedReader translatedFile, FilterContext fc)
            throws Exception {
        Map<String, String> sourceEntries = new HashMap<>();
        Map<String, String> translatedEntries = new HashMap<>();

        // process() stores into whatever map "align" currently points at
        align = sourceEntries;
        processFile(sourceFile, new NullBufferedWriter(), fc);
        align = translatedEntries;
        processFile(translatedFile, new NullBufferedWriter(), fc);

        for (Map.Entry<String, String> sourceEntry : sourceEntries.entrySet()) {
            String entryKey = sourceEntry.getKey();
            String translation = translatedEntries.get(entryKey);
            if (StringUtil.isEmpty(translation)) {
                continue;
            }
            entryAlignCallback.addTranslation(entryKey, sourceEntry.getValue(), translation, false, null, this);
        }
    }

    /**
     * Shows the options dialog of this filter.
     *
     * @param parent
     *            window the dialog is opened from
     * @param config
     *            options the dialog starts with
     * @return the options chosen in the dialog when it was confirmed;
     *         {@code null} when it was dismissed or when anything went wrong
     *         (the failure is logged)
     */
    @Override
    public Map<String, String> changeOptions(Window parent, Map<String, String> config) {
        try {
            ResourceBundleOptionsDialog optionsDialog = new ResourceBundleOptionsDialog(parent, config);
            optionsDialog.setVisible(true);
            boolean confirmed = ResourceBundleOptionsDialog.RET_OK == optionsDialog.getReturnStatus();
            return confirmed ? optionsDialog.getOptions() : null;
        } catch (Exception e) {
            Log.log(OStrings.getString("RB_FILTER_EXCEPTION"));
            Log.log(e);
            return null;
        }
    }

    /**
     * Tells whether this filter can be configured.
     *
     * @return always {@code true}: the Java Resource Bundles filter has
     *         options
     */
    @Override
    public boolean hasOptions() {
        return true;
    }

}