// EditorUtils.java example
//
// Explorer
// OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2008 Alex Buloichik
               2012 Didier Briel
               2015 Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.gui.editor;

import java.util.List;
import java.util.Locale;

import javax.swing.text.BadLocationException;
import javax.swing.text.JTextComponent;
import javax.swing.text.Utilities;

import org.omegat.core.Core;
import org.omegat.core.data.ProtectedPart;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.gui.editor.IEditor.CHANGE_CASE_TO;
import org.omegat.gui.glossary.GlossaryEntry;
import org.omegat.tokenizer.ITokenizer;
import org.omegat.util.StringUtil;
import org.omegat.util.TagUtil;
import org.omegat.util.TagUtil.Tag;
import org.omegat.util.Token;

/**
 * Static helper methods used by the editor: bidirectional-text handling,
 * word-boundary lookup, case changing and glossary term replacement.
 *
 * @author Alex Buloichik (alex73mail@gmail.com)
 * @author Didier Briel
 * @author Aaron Madlon-Kay
 */
public final class EditorUtils {

    /** Utility class; not instantiable. */
    private EditorUtils() {
    }

    /**
     * Check if language is Right-To-Left oriented.
     * <p>
     * The comparison is case-insensitive. Both the current and the legacy
     * two-letter codes for Hebrew ("he"/"iw") and Yiddish ("yi"/"ji") are
     * recognized.
     *
     * @param language
     *            two-letter language code (may be null, in which case the
     *            result is false)
     * @return true if language is RTL
     */
    public static boolean isRTL(final String language) {
        return "ar".equalsIgnoreCase(language) || "iw".equalsIgnoreCase(language)
                || "he".equalsIgnoreCase(language) || "fa".equalsIgnoreCase(language)
                || "ur".equalsIgnoreCase(language) || "ug".equalsIgnoreCase(language)
                || "ji".equalsIgnoreCase(language) || "yi".equalsIgnoreCase(language);
    }

    /**
     * Check if the JVM default locale is Right-To-Left oriented.
     *
     * @return true if the default locale's language is Right-To-Left oriented.
     */
    public static boolean localeIsRTL() {
        // isRTL() compares case-insensitively, so no (default-locale-sensitive)
        // case folding is needed before the lookup.
        return isRTL(Locale.getDefault().getLanguage());
    }

    /**
     * Determines the start of a word for the given model location. This method
     * skips a direction char found at the word start.
     *
     * TODO: change to use document's locale
     *
     * @param c
     *            the text component whose document is inspected
     * @param offs
     *            the offset in the document
     * @return the offset of the word start, moved past a leading direction char
     * @throws BadLocationException
     *             if the offset is not valid for the document
     */
    public static int getWordStart(JTextComponent c, int offs) throws BadLocationException {
        int result = Utilities.getWordStart(c, offs);
        // Only look at the char at 'result' when there is one; mirrors the
        // lower-bound guard in getWordEnd().
        if (result < c.getDocument().getLength()
                && isDirectionChar(c.getDocument().getText(result, 1).charAt(0))) {
            result++;
        }
        return result;
    }

    /**
     * Determines the end of a word for the given model location. This method
     * skips a direction char found at the word end.
     *
     * TODO: change to use document's locale
     *
     * @param c
     *            the text component whose document is inspected
     * @param offs
     *            the offset in the document
     * @return the offset of the word end, moved before a trailing direction char
     * @throws BadLocationException
     *             if the offset is not valid for the document
     */
    public static int getWordEnd(JTextComponent c, int offs) throws BadLocationException {
        int result = Utilities.getWordEnd(c, offs);
        if (result > 0) {
            char ch = c.getDocument().getText(result - 1, 1).charAt(0);
            if (isDirectionChar(ch)) {
                result--;
            }
        }
        return result;
    }

    /**
     * Check if char is a direction char (u202A, u202B, u202C, u200E, u200F).
     *
     * @param ch
     *            char to check
     * @return true if it's direction char
     */
    private static boolean isDirectionChar(final char ch) {
        return ch == '\u202A' || ch == '\u202B' || ch == '\u202C' || ch == '\u200E' || ch == '\u200F';
    }

    /**
     * Remove invisible direction chars from string. The set of removed chars
     * is the one recognized by {@link #isDirectionChar(char)}.
     *
     * @param text
     *            string with direction chars
     * @return string without direction chars
     */
    public static String removeDirectionChars(String text) {
        StringBuilder kept = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);
            if (!isDirectionChar(ch)) {
                kept.append(ch);
            }
        }
        // Nothing removed: hand back the input instead of a copy.
        return kept.length() == text.length() ? text : kept.toString();
    }

    /**
     * Remove bidi chars around tags only. A tag is unwrapped only when
     * {@link #hasBidiAroundTag(String, String, int)} reports that it carries
     * the two marks on each side; direction chars elsewhere are left alone.
     *
     * @param text
     *            string with direction chars
     * @param ste
     *            entry whose protected parts are the tags to unwrap
     * @return string without direction chars around its tags
     */
    public static String removeDirectionCharsAroundTags(String text, SourceTextEntry ste) {
        for (ProtectedPart pp : ste.getProtectedParts()) {
            String tag = pp.getTextInSourceSegment();
            if (tag == null || tag.isEmpty()) {
                // An empty tag matches at every offset, including the end of
                // the text, so the search below would never terminate.
                continue;
            }
            int pos = -1;
            while ((pos = text.indexOf(tag, pos + 1)) >= 0) {
                if (hasBidiAroundTag(text, tag, pos)) {
                    // Drop the two marks on each side. The tag now starts two
                    // chars earlier; resuming at pos + 1 cannot skip another
                    // wrapped occurrence because that one needs two marks of
                    // its own in front of it.
                    text = text.substring(0, pos - 2) + tag + text.substring(pos + tag.length() + 2);
                }
            }
        }
        return text;
    }

    /**
     * Change the case of the input string to the indicated case. When toWhat is
     * {@link CHANGE_CASE_TO#CYCLE} the result will be UPPER &gt; LOWER &gt;
     * SENTENCE &gt; TITLE &gt; UPPER.
     * <p>
     * This is a convenience method for
     * {@link #doChangeCase(String, CHANGE_CASE_TO, Locale, ITokenizer)}. The
     * locale and tokenizer will be taken from the current project's target
     * language values.
     *
     * @param input
     *            The string to change
     * @param toWhat
     *            The case to change to, or {@link CHANGE_CASE_TO#CYCLE}
     * @return The modified string
     */
    public static String doChangeCase(String input, CHANGE_CASE_TO toWhat) {
        Locale locale = Core.getProject().getProjectProperties().getTargetLanguage().getLocale();
        ITokenizer tokenizer = Core.getProject().getTargetTokenizer();
        return doChangeCase(input, toWhat, locale, tokenizer);
    }

    /**
     * Change the case of the input string to the indicated case. When toWhat is
     * {@link CHANGE_CASE_TO#CYCLE} the result will be UPPER &gt; LOWER &gt;
     * SENTENCE &gt; TITLE &gt; UPPER.
     * <p>
     * Tokens that do not start with a letter (see
     * {@link #canChangeTokenCase(String)}) are copied through unchanged.
     *
     * @param input
     *            The string to change
     * @param toWhat
     *            The case to change to, or {@link CHANGE_CASE_TO#CYCLE}
     * @param locale
     *            The locale of the input string
     * @param tokenizer
     *            A tokenizer for the input string language
     * @return The modified string
     */
    public static String doChangeCase(String input, CHANGE_CASE_TO toWhat, Locale locale, ITokenizer tokenizer) {
        // tokenize the selection
        Token[] tokenList = tokenizer.tokenizeVerbatim(input);

        if (toWhat == CHANGE_CASE_TO.CYCLE) {
            toWhat = resolveCycleTarget(input, tokenList);
            if (toWhat == null) {
                return input; // nothing to do here
            }
        }

        StringBuilder buffer = new StringBuilder(input);
        // Case conversion may change a token's length; this tracks how far the
        // offsets in 'buffer' have drifted from the offsets in 'input'.
        int lengthIncrement = 0;

        for (Token token : tokenList) {
            // find out the case and change to the selected
            String tokText = token.getTextFromString(input);
            if (!canChangeTokenCase(tokText)) {
                continue;
            }
            String result;
            if (toWhat == CHANGE_CASE_TO.LOWER) {
                result = tokText.toLowerCase(locale);
            } else if (toWhat == CHANGE_CASE_TO.UPPER) {
                result = tokText.toUpperCase(locale);
            } else if (toWhat == CHANGE_CASE_TO.TITLE) {
                result = StringUtil.toTitleCase(tokText, locale);
            } else if (toWhat == CHANGE_CASE_TO.SENTENCE) {
                // Sentence case: capitalize the first changeable token only,
                // then lower-case every following one.
                result = StringUtil.toTitleCase(tokText, locale);
                toWhat = CHANGE_CASE_TO.LOWER;
            } else {
                result = tokText;
            }

            // replace this token
            int start = token.getOffset() + lengthIncrement;
            buffer.replace(start, start + token.getLength(), result);

            lengthIncrement += result.length() - token.getLength();
        }

        return buffer.toString();
    }

    /**
     * Work out which concrete case {@link CHANGE_CASE_TO#CYCLE} stands for,
     * based on the cases of the tokens currently present in the input.
     *
     * @param input
     *            The string the tokens were produced from
     * @param tokenList
     *            The tokens of the input
     * @return The case to change to, or null when the input contains no token
     *         with a recognizable case
     */
    private static CHANGE_CASE_TO resolveCycleTarget(String input, Token[] tokenList) {
        int lower = 0;
        int upper = 0;
        int title = 0;
        int ambiguous = 0; // Maybe title, maybe upper
        int mixed = 0;

        for (Token token : tokenList) {
            String word = token.getTextFromString(input);
            if (!canChangeTokenCase(word)) {
                continue;
            }
            if (StringUtil.isLowerCase(word)) {
                lower++;
                continue;
            }
            boolean isTitle = StringUtil.isTitleCase(word);
            boolean isUpper = StringUtil.isUpperCase(word);
            if (isTitle && isUpper) {
                ambiguous++;
            } else if (isTitle) {
                title++;
            } else if (isUpper) {
                upper++;
            } else if (StringUtil.isMixedCase(word)) {
                mixed++;
            }
            // Ignore other tokens as they should be caseless text
            // such as CJK ideographs or symbols only.
        }

        if (lower == 0 && title == 0 && upper == 0 && mixed == 0 && ambiguous == 0) {
            return null;
        }
        return determineTargetCase(lower, upper, title, mixed, ambiguous);
    }

    /**
     * Determine whether or not the provided token should be considered when
     * changing the case of a larger string. We don't want to consider, e.g.,
     * OmegaT tags because changing their case can break them.
     *
     * @param token
     *            The token text
     * @return Whether or not to change case; false for an empty token or one
     *         whose first code point is not a letter
     */
    private static boolean canChangeTokenCase(String token) {
        return !token.isEmpty() && Character.isLetter(token.codePointAt(0));
    }

    /**
     * Pick the next case in the cycle from the number of tokens found in each
     * case.
     *
     * @param lower
     *            number of lower-case tokens
     * @param upper
     *            number of unambiguously upper-case tokens
     * @param title
     *            number of unambiguously title-case tokens
     * @param mixed
     *            number of mixed-case tokens
     * @param ambiguous
     *            number of tokens that are both title and upper case
     * @return the case to change to
     */
    private static CHANGE_CASE_TO determineTargetCase(int lower, int upper, int title, int mixed, int ambiguous) {
        int presentCaseTypes = 0;
        if (lower > 0) {
            presentCaseTypes++;
        }
        if (upper > 0) {
            presentCaseTypes++;
        }
        if (title > 0) {
            presentCaseTypes++;
        }
        if (mixed > 0) {
            presentCaseTypes++;
        }

        // A capitalized token combined with lower-case ones and nothing else
        // is treated as sentence case, whose successor in the cycle is TITLE.
        if ((title > 0 || ambiguous > 0) && lower > 0 && upper == 0 && mixed == 0) {
            return CHANGE_CASE_TO.TITLE;
        }

        if (mixed > 0 || presentCaseTypes > 1) {
            return CHANGE_CASE_TO.UPPER;
        }

        if (lower > 0) {
            return CHANGE_CASE_TO.SENTENCE;
        }

        if (title > 0) {
            return CHANGE_CASE_TO.UPPER;
        }

        if (upper > 0) {
            return CHANGE_CASE_TO.LOWER;
        }

        if (ambiguous > 0) {
            // If we only have ambiguous tokens then we must go to lower so that we
            // get binary upper/lower switching instead of trinary upper/lower/title.
            return CHANGE_CASE_TO.LOWER;
        }

        // This should only happen if no cases are present, so it doesn't even matter.
        return CHANGE_CASE_TO.UPPER;
    }

    /**
     * Convenience method for {@link #replaceGlossaryEntries(String, List, Locale, ITokenizer)}. Glossary entries are
     * retrieved from {@code GlossaryManager}; the locale and tokenizer are taken from the project's current values for
     * the source language.
     *
     * @param text
     *            Text in which to replace glossary hits. Assumed to be in the project's source language.
     * @return Text with source glossary terms replaced with target terms
     */
    public static String replaceGlossaryEntries(String text) {
        Locale locale = Core.getProject().getProjectProperties().getSourceLanguage().getLocale();
        ITokenizer tokenizer = Core.getProject().getSourceTokenizer();
        return replaceGlossaryEntries(text, Core.getGlossaryManager().getGlossaryEntries(text),
                locale, tokenizer);
    }

    /**
     * Given a list of glossary entries, replace any instances of the source term appearing in the given text with the
     * target term. When several entries match at the same position, the first one in the list is used. Entries whose
     * source term yields no tokens never match.
     *
     * @param text
     *            Text in which to replace glossary hits (assumed to be in the project's source language)
     * @param entries
     *            List of glossary entries
     * @param locale
     *            Locale with which to perform capitalization matching (assumed to be source locale)
     * @param tokenizer
     *            Tokenizer with which to split text (assumed to be project's source tokenizer)
     * @return Text with source glossary terms replaced with target terms
     */
    public static String replaceGlossaryEntries(String text, List<GlossaryEntry> entries, Locale locale,
            ITokenizer tokenizer) {
        if (StringUtil.isEmpty(text) || entries == null || entries.isEmpty()) {
            return text;
        }
        String[] haystack = tokenizer.tokenizeVerbatimToStrings(text);

        // Tokenize every source term once instead of once per text position.
        String[][] needles = new String[entries.size()][];
        int filled = 0;
        for (GlossaryEntry e : entries) {
            needles[filled++] = tokenizer.tokenizeVerbatimToStrings(e.getSrcText());
        }

        StringBuilder sb = new StringBuilder(text.length());
        int i = 0;
        while (i < haystack.length) {
            String tok = haystack[i];
            int consumed = 0;
            int entryIndex = 0;
            for (GlossaryEntry e : entries) {
                String[] needle = needles[entryIndex++];
                if (tokensPresentAt(needle, haystack, i)) {
                    // Capitalization follows the first token of the match.
                    sb.append(StringUtil.matchCapitalization(e.getLocText(), tok, locale));
                    consumed = needle.length;
                    break;
                }
            }
            if (consumed == 0) {
                sb.append(tok);
                consumed = 1;
            }
            // tokensPresentAt() rejects empty needles, so this always advances.
            i += consumed;
        }
        return sb.toString();
    }

    /**
     * Check whether all needle tokens appear consecutively in the haystack starting at the given offset, ignoring
     * case.
     *
     * @param needle
     *            tokens to look for
     * @param haystack
     *            tokens to look in
     * @param offset
     *            index in the haystack at which the needle must start
     * @return true if the needle is non-empty and present at the offset
     */
    private static boolean tokensPresentAt(String[] needle, String[] haystack, int offset) {
        // An empty needle would match everywhere without consuming anything.
        if (needle.length == 0 || offset < 0 || offset + needle.length > haystack.length) {
            return false;
        }
        for (int i = 0; i < needle.length; i++) {
            if (!haystack[i + offset].equalsIgnoreCase(needle[i])) {
                return false;
            }
        }
        return true;
    }

    /**
     * Add RTL+LTR around tags. Used for display tags better in RTL text.
     * Each tag is emitted as RLM, LRM, tag, LRM, RLM.
     *
     * @param text
     *            text containing tags
     * @param ste
     *            entry whose protected parts describe the tags
     * @return text with bidi marks added around every tag
     */
    public static String addBidiAroundTags(String text, SourceTextEntry ste) {
        List<Tag> tags = TagUtil.buildTagList(text, ste.getProtectedParts());

        int pos = 0;
        // Every tag gains exactly four marks.
        StringBuilder s = new StringBuilder(text.length() + 4 * tags.size());
        for (Tag t : tags) {
            if (pos < t.pos) {
                s.append(text, pos, t.pos);
            }
            s.append(SegmentBuilder.BIDI_RLM_CHAR);
            s.append(SegmentBuilder.BIDI_LRM_CHAR);
            s.append(t.tag);
            s.append(SegmentBuilder.BIDI_LRM_CHAR);
            s.append(SegmentBuilder.BIDI_RLM_CHAR);
            pos = t.pos + t.tag.length();
        }
        if (pos < text.length()) {
            s.append(text, pos, text.length());
        }
        return s.toString();
    }

    /**
     * Check whether the tag at the given position is wrapped in the bidi marks
     * written by {@link #addBidiAroundTags(String, SourceTextEntry)}, i.e.
     * preceded by RLM, LRM and followed by LRM, RLM.
     *
     * @param text
     *            text containing the tag
     * @param tag
     *            the tag text
     * @param pos
     *            offset of the tag in the text
     * @return true if all four marks are present; false otherwise, including
     *         when the text is too short to hold them
     */
    public static boolean hasBidiAroundTag(String text, String tag, int pos) {
        // Two chars are needed before the tag and two after it.
        if (pos < 2 || (long) pos + tag.length() + 2 > text.length()) {
            return false;
        }
        int end = pos + tag.length();
        return text.charAt(pos - 2) == SegmentBuilder.BIDI_RLM_CHAR
                && text.charAt(pos - 1) == SegmentBuilder.BIDI_LRM_CHAR
                && text.charAt(end) == SegmentBuilder.BIDI_LRM_CHAR
                && text.charAt(end + 1) == SegmentBuilder.BIDI_RLM_CHAR;
    }
}