/************************************************************************** OmegaT - Computer Assisted Translation (CAT) tool with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. Copyright (C) 2008 Alex Buloichik 2012 Didier Briel 2015 Aaron Madlon-Kay Home page: http://www.omegat.org/ Support center: http://groups.yahoo.com/group/OmegaT/ This file is part of OmegaT. OmegaT is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OmegaT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. **************************************************************************/ package org.omegat.gui.editor; import java.util.List; import java.util.Locale; import javax.swing.text.BadLocationException; import javax.swing.text.JTextComponent; import javax.swing.text.Utilities; import org.omegat.core.Core; import org.omegat.core.data.ProtectedPart; import org.omegat.core.data.SourceTextEntry; import org.omegat.gui.editor.IEditor.CHANGE_CASE_TO; import org.omegat.gui.glossary.GlossaryEntry; import org.omegat.tokenizer.ITokenizer; import org.omegat.util.StringUtil; import org.omegat.util.TagUtil; import org.omegat.util.TagUtil.Tag; import org.omegat.util.Token; /** * Some utilities methods. * * @author Alex Buloichik (alex73mail@gmail.com) * @author Didier Briel * @author Aaron Madlon-Kay */ public final class EditorUtils { private EditorUtils() { } /** * Check if language is Right-To-Left oriented. * * @param language * ISO-639-2 language code * @return true if language is RTL */ public static boolean isRTL(final String language) { return "ar".equalsIgnoreCase(language) || "iw".equalsIgnoreCase(language) || "he".equalsIgnoreCase(language) || "fa".equalsIgnoreCase(language) || "ur".equalsIgnoreCase(language) || "ug".equalsIgnoreCase(language) || "ji".equalsIgnoreCase(language) || "yi".equalsIgnoreCase(language); } /** * Check if locale is Right-To-Left oriented. * @return true if locale is Right-To-Left oriented. */ public static boolean localeIsRTL() { String language = Locale.getDefault().getLanguage().toLowerCase(); return EditorUtils.isRTL(language); } /** * Determines the start of a word for the given model location. This method * skips direction char. * * TODO: change to use document's locale * * @param c * @param offs * @return * @throws BadLocationException */ public static int getWordStart(JTextComponent c, int offs) throws BadLocationException { int result = Utilities.getWordStart(c, offs); char ch = c.getDocument().getText(result, 1).charAt(0); if (isDirectionChar(ch)) { result++; } return result; } /** * Determines the end of a word for the given model location. This method * skips direction char. * * TODO: change to use document's locale * * @param c * @param offs * @return * @throws BadLocationException */ public static int getWordEnd(JTextComponent c, int offs) throws BadLocationException { int result = Utilities.getWordEnd(c, offs); if (result > 0) { char ch = c.getDocument().getText(result - 1, 1).charAt(0); if (isDirectionChar(ch)) { result--; } } return result; } /** * Check if char is direction char(u202A,u202B,u202C). * * @param ch * char to check * @return true if it's direction char */ private static boolean isDirectionChar(final char ch) { return ch == '\u202A' || ch == '\u202B' || ch == '\u202C' || ch == '\u200E' || ch == '\u200F'; } /** * Remove invisible direction chars from string. * * @param text * string with direction chars * @return string without direction chars */ public static String removeDirectionChars(String text) { return text.replaceAll("[\u202A\u202B\u202C\u200E\u200F]", ""); } /** * Remove bidi chars around tags only. * * @param text * string with direction chars * @return string without direction chars */ public static String removeDirectionCharsAroundTags(String text, SourceTextEntry ste) { for (ProtectedPart pp : ste.getProtectedParts()) { int pos = -1; while ((pos = text.indexOf(pp.getTextInSourceSegment(), pos + 1)) >= 0) { if (hasBidiAroundTag(text, pp.getTextInSourceSegment(), pos)) { // remove bidi chars around text = text.substring(0, pos - 2) + pp.getTextInSourceSegment() + text.substring(pos + pp.getTextInSourceSegment().length() + 2); } } } return text; } /** * Change the case of the input string to the indicated case. When toWhat is * {@link CHANGE_CASE_TO#CYCLE} the result will be UPPER > LOWER > SENTENCE * > TITLE > UPPER. * <p> * This is a convenience method for * {@link #doChangeCase(String, CHANGE_CASE_TO, Locale, ITokenizer)}. The * locale and tokenizer will be taken from the current project's target * language values. * * @param input * The string to change * @param toWhat * The case to change to, or {@link CHANGE_CASE_TO#CYCLE} * @return The modified string */ public static String doChangeCase(String input, CHANGE_CASE_TO toWhat) { Locale locale = Core.getProject().getProjectProperties().getTargetLanguage().getLocale(); ITokenizer tokenizer = Core.getProject().getTargetTokenizer(); return doChangeCase(input, toWhat, locale, tokenizer); } /** * Change the case of the input string to the indicated case. When toWhat is * {@link CHANGE_CASE_TO#CYCLE} the result will be UPPER > LOWER > SENTENCE * > TITLE > UPPER. * * @param input * The string to change * @param toWhat * The case to change to, or {@link CHANGE_CASE_TO#CYCLE} * @param locale * The locale of the input string * @param tokenizer * A tokenizer for the input string language * @return The modified string */ public static String doChangeCase(String input, CHANGE_CASE_TO toWhat, Locale locale, ITokenizer tokenizer) { // tokenize the selection Token[] tokenList = tokenizer.tokenizeVerbatim(input); if (toWhat == CHANGE_CASE_TO.CYCLE) { int lower = 0; int upper = 0; int title = 0; int ambiguous = 0; // Maybe title, maybe upper int mixed = 0; for (Token token : tokenList) { String word = token.getTextFromString(input); if (!canChangeTokenCase(word)) { continue; } if (StringUtil.isLowerCase(word)) { lower++; continue; } boolean isTitle = StringUtil.isTitleCase(word); boolean isUpper = StringUtil.isUpperCase(word); if (isTitle && isUpper) { ambiguous++; continue; } if (isTitle) { title++; continue; } if (isUpper) { upper++; continue; } if (StringUtil.isMixedCase(word)) { mixed++; } // Ignore other tokens as they should be caseless text // such as CJK ideographs or symbols only. } if (lower == 0 && title == 0 && upper == 0 && mixed == 0 && ambiguous == 0) { return input; // nothing to do here } toWhat = determineTargetCase(lower, upper, title, mixed, ambiguous); } StringBuilder buffer = new StringBuilder(input); int lengthIncrement = 0; for (Token token : tokenList) { // find out the case and change to the selected String tokText = token.getTextFromString(input); if (!canChangeTokenCase(tokText)) { continue; } String result; if (toWhat == CHANGE_CASE_TO.LOWER) { result = tokText.toLowerCase(locale); } else if (toWhat == CHANGE_CASE_TO.UPPER) { result = tokText.toUpperCase(locale); } else if (toWhat == CHANGE_CASE_TO.TITLE) { result = StringUtil.toTitleCase(tokText, locale); } else if (toWhat == CHANGE_CASE_TO.SENTENCE) { result = StringUtil.toTitleCase(tokText, locale); toWhat = CHANGE_CASE_TO.LOWER; } else { result = tokText; } // replace this token buffer.replace(token.getOffset() + lengthIncrement, token.getLength() + token.getOffset() + lengthIncrement, result); lengthIncrement += result.length() - token.getLength(); } return buffer.toString(); } /** * Determine whether or not the provided token should be considered when * changing the case of a larger string. We don't want to consider, e.g., * OmegaT tags because changing their case can break them. * * @param token * @return Whether or not to change case */ private static boolean canChangeTokenCase(String token) { return Character.isLetter(token.codePointAt(0)); } private static CHANGE_CASE_TO determineTargetCase(int lower, int upper, int title, int mixed, int ambiguous) { int presentCaseTypes = 0; if (lower > 0) { presentCaseTypes++; } if (upper > 0) { presentCaseTypes++; } if (title > 0) { presentCaseTypes++; } if (mixed > 0) { presentCaseTypes++; } if ((title > 0 || ambiguous > 0) && lower > 0 && upper == 0 && mixed == 0) { return CHANGE_CASE_TO.TITLE; } if (mixed > 0 || presentCaseTypes > 1) { return CHANGE_CASE_TO.UPPER; } if (lower > 0) { return CHANGE_CASE_TO.SENTENCE; } if (title > 0) { return CHANGE_CASE_TO.UPPER; } if (upper > 0) { return CHANGE_CASE_TO.LOWER; } if (ambiguous > 0) { // If we only have ambiguous tokens then we must go to lower so that we // get binary upper/lower switching instead of trinary upper/lower/title. return CHANGE_CASE_TO.LOWER; } // This should only happen if no cases are present, so it doesn't even matter. return CHANGE_CASE_TO.UPPER; } /** * Convenience method for {@link #replaceGlossaryEntries(String, List, Locale, ITokenizer)}. Glossary entries are * retrieved from {@code GlossaryManager}; the locale and tokenizer are taken from the project's current values for * the source language. * * @param text * Text in which to replace glossary hits. Assumed to be in the project's source language. * @return Text with source glossary terms replaced with target terms */ public static String replaceGlossaryEntries(String text) { Locale locale = Core.getProject().getProjectProperties().getSourceLanguage().getLocale(); ITokenizer tokenizer = Core.getProject().getSourceTokenizer(); return replaceGlossaryEntries(text, Core.getGlossaryManager().getGlossaryEntries(text), locale, tokenizer); } /** * Given a list of glossary entries, replace any instances of the source term appearing in the given text with the * target term. When there are multiple target terms, the first one is used. * * @param text * Text in which to replace glossary hits (assumed to be in the project's source language) * @param entries * List of glossary entries * @param locale * Locale with which to perform capitalization matching (assumed to be source locale) * @param tokenizer * Tokenizer with which to split text (assumed to be project's source tokenizer) * @return Text with source glossary terms replaced with target terms */ public static String replaceGlossaryEntries(String text, List<GlossaryEntry> entries, Locale locale, ITokenizer tokenizer) { if (StringUtil.isEmpty(text) || entries == null || entries.isEmpty()) { return text; } StringBuilder sb = new StringBuilder(); String[] haystack = tokenizer.tokenizeVerbatimToStrings(text); for (int i = 0; i < haystack.length; i++) { String tok = haystack[i]; boolean replaced = false; for (GlossaryEntry e : entries) { String[] needle = tokenizer.tokenizeVerbatimToStrings(e.getSrcText()); if (tokensPresentAt(needle, haystack, i)) { sb.append(StringUtil.matchCapitalization(e.getLocText(), tok, locale)); replaced = true; i += needle.length - 1; break; } } if (!replaced) { sb.append(tok); } } return sb.toString(); } private static boolean tokensPresentAt(String[] needle, String[] haystack, int offset) { if (offset < 0 || offset + needle.length > haystack.length) { return false; } for (int i = 0; i < needle.length; i++) { String hayToken = haystack[i + offset]; String needleToken = needle[i]; if (!hayToken.equalsIgnoreCase(needleToken)) { return false; } } return true; } /** * Add RTL+LTR around tags. Used for display tags better in RTL text. */ public static String addBidiAroundTags(String text, SourceTextEntry ste) { List<Tag> tags = TagUtil.buildTagList(text, ste.getProtectedParts()); int pos = 0; StringBuilder s = new StringBuilder(text.length() * 12 / 10); for (Tag t : tags) { if (pos < t.pos) { s.append(text.substring(pos, t.pos)); } s.append(SegmentBuilder.BIDI_RLM_CHAR); s.append(SegmentBuilder.BIDI_LRM_CHAR); s.append(t.tag); s.append(SegmentBuilder.BIDI_LRM_CHAR); s.append(SegmentBuilder.BIDI_RLM_CHAR); pos = t.pos + t.tag.length(); } if (pos < text.length()) { s.append(text.substring(pos)); } return s.toString(); } public static boolean hasBidiAroundTag(String text, String tag, int pos) { try { boolean has = true; if (text.charAt(pos - 1) != SegmentBuilder.BIDI_LRM_CHAR || text.charAt(pos - 2) != SegmentBuilder.BIDI_RLM_CHAR) { has = false; } if (text.charAt(pos + tag.length()) != SegmentBuilder.BIDI_LRM_CHAR || text.charAt(pos + tag.length() + 1) != SegmentBuilder.BIDI_RLM_CHAR) { has = false; } return has; } catch (StringIndexOutOfBoundsException ex) { // before or after known string - don't have bidi chars around this tag return false; } } }