/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2000-2006 Keith Godfrey, Maxym Mykhalchuk, and Henry Pijffers
               2007 Didier Briel, Zoltan Bartko
               2008 Alex Buloichik
               2015 Didier Briel, Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.tokenizer;

import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.omegat.core.CoreEvents;
import org.omegat.core.events.IProjectEventListener;
import org.omegat.util.PatternConsts;
import org.omegat.util.StringUtil;
import org.omegat.util.Token;

/**
 * Methods for tokenizing strings.
 *
 * @author Keith Godfrey
 * @author Maxym Mykhalchuk
 * @author Henry Pijffers (henry.pijffers@saxnot.com)
 * @author Didier Briel
 * @author Zoltan Bartko - bartkozoltan@bartkozoltan.com
 * @author Alex Buloichik
 * @author Aaron Madlon-Kay
 */
public class DefaultTokenizer implements ITokenizer {

    /**
     * Contains a list of tokens for each *unique* string. Storing tokens only
     * once per unique string saves memory. Token lists are not cached when all
     * tokens are requested (verbatim mode), again to save memory.
     */
    private static Map<String, Token[]> tokenCache = new HashMap<String, Token[]>(5000);

    private static final Token[] EMPTY_TOKENS_LIST = new Token[0];
    private static final String[] EMPTY_STRINGS_LIST = new String[0];

    public DefaultTokenizer() {
        CoreEvents.registerProjectChangeListener(new IProjectEventListener() {
            public void onProjectChanged(PROJECT_CHANGE_TYPE eventType) {
                if (eventType == PROJECT_CHANGE_TYPE.CLOSE) {
                    // clear the token cache when the project is closed
                    synchronized (tokenCache) {
                        tokenCache.clear();
                    }
                }
            }
        });
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Token[] tokenizeWords(final String strOrig, final StemmingMode stemmingMode) {
        if (StringUtil.isEmpty(strOrig)) {
            return EMPTY_TOKENS_LIST;
        }

        Token[] result;
        synchronized (tokenCache) {
            result = tokenCache.get(strOrig);
        }
        if (result != null) {
            return result;
        }

        result = tokenizeTextNoCache(strOrig, false);

        // put the result in the cache
        synchronized (tokenCache) {
            tokenCache.put(strOrig, result);
        }
        return result;
    }

    @Override
    public String[] tokenizeWordsToStrings(String str, StemmingMode stemmingMode) {
        if (StringUtil.isEmpty(str)) {
            return EMPTY_STRINGS_LIST;
        }
        return tokenizeTextToStringsNoCache(str, false);
    }

    @Override
    public Token[] tokenizeVerbatim(final String strOrig) {
        return tokenizeTextNoCache(strOrig, true);
    }

    @Override
    public String[] tokenizeVerbatimToStrings(String str) {
        return tokenizeTextToStringsNoCache(str, true);
    }
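
    /*
     * Illustrative usage sketch, not part of the original source: it shows how
     * the word and verbatim entry points above are typically called. The
     * variable names and the StemmingMode.NONE constant (from ITokenizer) are
     * assumptions for the example only; note that DefaultTokenizer itself
     * never consults the stemming mode it is given.
     *
     *     ITokenizer tokenizer = new DefaultTokenizer();
     *     // Word tokens only; the result is cached per unique input string.
     *     Token[] words = tokenizer.tokenizeWords("Fine, thanks, and you?", StemmingMode.NONE);
     *     // Every token, including punctuation and whitespace; never cached.
     *     Token[] everything = tokenizer.tokenizeVerbatim("Fine, thanks, and you?");
     */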

    /**
     * Breaks a string into tokens.
     * <p>
     * Examples:
     * <ul>
     * <li>This is a semi-good way. -> "this", "is", "a", "semi-good", "way"
     * <li>Fine, thanks, and you? -> "fine", "thanks", "and", "you"
     * <li>C&all this action -> "call", "this", "action" ('&' is eaten)
     * </ul>
     * <p>
     * OmegaT tags and other non-word tokens are skipped if the parameter "all"
     * is false.
     *
     * @param strOrig
     *            the string to tokenize
     * @param all
     *            if true, numbers, tags, and other non-word tokens are
     *            included in the list
     * @return the resulting array of tokens
     */
    private static Token[] tokenizeTextNoCache(final String strOrig, final boolean all) {
        if (StringUtil.isEmpty(strOrig)) {
            // fixes bug nr. 1382810 (StringIndexOutOfBoundsException)
            return EMPTY_TOKENS_LIST;
        }

        // create a new token list
        List<Token> tokens = new ArrayList<Token>(64);

        // get a word breaker
        BreakIterator breaker = getWordBreaker();
        breaker.setText(strOrig);

        int start = breaker.first();
        for (int end = breaker.next(); end != BreakIterator.DONE; start = end, end = breaker.next()) {
            String tokenStr = strOrig.substring(start, end);
            if (all) {
                // Accepting all tokens
                tokens.add(new Token(tokenStr, start));
                continue;
            }
            // Accepting only words that aren't OmegaT tags
            boolean word = false;
            for (int cp, i = 0; i < tokenStr.length(); i += Character.charCount(cp)) {
                cp = tokenStr.codePointAt(i);
                if (Character.isLetter(cp)) {
                    word = true;
                    break;
                }
            }
            if (word && !PatternConsts.OMEGAT_TAG.matcher(tokenStr).matches()) {
                tokens.add(new Token(tokenStr, start));
            }
        }
        return tokens.toArray(new Token[tokens.size()]);
    }

    private static String[] tokenizeTextToStringsNoCache(String str, boolean all) {
        if (StringUtil.isEmpty(str)) {
            return EMPTY_STRINGS_LIST;
        }

        // create a new token list
        List<String> tokens = new ArrayList<String>(64);

        // get a word breaker
        BreakIterator breaker = getWordBreaker();
        breaker.setText(str);

        int start = breaker.first();
        for (int end = breaker.next(); end != BreakIterator.DONE; start = end, end = breaker.next()) {
            String tokenStr = str.substring(start, end);
            if (all) {
                // Accepting all tokens
                tokens.add(tokenStr);
                continue;
            }
            // Accepting only words that aren't OmegaT tags
            boolean word = false;
            for (int cp, i = 0; i < tokenStr.length(); i += Character.charCount(cp)) {
                cp = tokenStr.codePointAt(i);
                if (Character.isLetter(cp)) {
                    word = true;
                    break;
                }
            }
            if (word && !PatternConsts.OMEGAT_TAG.matcher(tokenStr).matches()) {
                tokens.add(tokenStr);
            }
        }
        return tokens.toArray(new String[tokens.size()]);
    }
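
    /*
     * Illustrative sketch of the "all" flag above, not part of the original
     * source (the input string is an arbitrary example). When "all" is false,
     * only tokens that contain at least one letter and do not match
     * PatternConsts.OMEGAT_TAG are kept, so numbers, punctuation, whitespace
     * and OmegaT tags are dropped:
     *
     *     tokenizeTextToStringsNoCache("3 + 4 = 7 apples", true);
     *         // every boundary-delimited chunk, including "3", "+" and spaces
     *     tokenizeTextToStringsNoCache("3 + 4 = 7 apples", false);
     *         // only "apples" remains: the sole token containing a letter
     */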

    /** Returns an iterator to break sentences into words. */
    public static BreakIterator getWordBreaker() {
        // if (wordBreaker==null)
        // wordBreaker = new WordIterator();
        // return wordBreaker;

        return new WordIterator();

        // HP: This is a fix for bug 1589484. If you use only one
        // WordIterator instance, it will lead to problems when
        // using multiple threads, as OmegaT does. Sometimes, in
        // the middle of breaking a string, another thread may set
        // a different text, and then you get index out of bounds
        // exceptions. By returning a new WordIterator each time
        // one is requested, this problem is solved, and it doesn't
        // hurt performance either.
    }

    /**
     * Checks if an array of tokens contains the given token.
     */
    public static boolean isContains(Token[] tokensList, Token tokenForCheck) {
        for (Token t : tokensList) {
            if (tokenForCheck.equals(t)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks if one array of tokens contains another array of tokens.
     *
     * @param tokensList
     *            a list of tokens to be searched
     * @param listForFind
     *            a list of tokens to search for in tokensList
     * @param notExact
     *            if true, the tokens in listForFind may be non-contiguous or
     *            in a different order in tokensList; if false, the tokens must
     *            occur exactly as given
     * @return true if the tokens in listForFind are found in tokensList
     */
    public static boolean isContainsAll(Token[] tokensList, Token[] listForFind, boolean notExact) {
        if (notExact) {
            for (Token t : listForFind) {
                if (!isContains(tokensList, t)) {
                    return false;
                }
            }
            return true;
        } else {
            return isContainsExact(tokensList, listForFind);
        }
    }

    /**
     * Checks if a list of tokens is found contiguously in another list of tokens.
     *
     * @param tokensList
     *            a list of tokens to be searched
     * @param listForFind
     *            a list of tokens to search for in tokensList
     * @return true if the tokens in listForFind are found contiguously in tokensList
     */
    private static boolean isContainsExact(Token[] tokensList, Token[] listForFind) {
        for (int i = 0; i < tokensList.length; i++) {
            // For each token in the searched list
            if (tokensList[i].equals(listForFind[0])) {
                // We found the first token of listForFind
                if (listForFind.length == 1) {
                    // Only one token, and we found it
                    return true;
                }
                int k = i + 1;
                if (listForFind.length <= tokensList.length - k + 1) {
                    // Enough tokens remain in tokensList to match listForFind
                    boolean found = true;
                    for (int j = 1; j < listForFind.length; j++) {
                        if (!listForFind[j].equals(tokensList[k])) {
                            // One of the other tokens doesn't match
                            found = false;
                            break;
                        }
                        k++;
                    }
                    if (found) {
                        // All tokens matched
                        return true;
                    }
                } else {
                    // Not enough tokens left for a contiguous match
                    return false;
                }
            }
        }
        return false;
    }

    @Override
    public String[] getSupportedLanguages() {
        return new String[0];
    }
}
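
/*
 * Illustrative sketch of the two matching modes of isContainsAll, not part of
 * the original source. The "tokenizer" variable and StemmingMode.NONE are
 * assumptions for the example, and it assumes Token equality is based on the
 * token text rather than its position:
 *
 *     Token[] haystack = tokenizer.tokenizeWords("the quick brown fox", StemmingMode.NONE);
 *     Token[] needle = tokenizer.tokenizeWords("brown quick", StemmingMode.NONE);
 *
 *     DefaultTokenizer.isContainsAll(haystack, needle, true);
 *         // true: with notExact, order and contiguity are ignored
 *     DefaultTokenizer.isContainsAll(haystack, needle, false);
 *         // false: "brown quick" does not occur as a contiguous run
 */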