BaseTokenizer.java example

Explorer
OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2008 Alex Buloichik (alex73mail@gmail.com)
               2013, 2015 Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/
package org.omegat.tokenizer;

import java.io.IOException;
import java.io.StringReader;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.omegat.core.Core;
import org.omegat.core.CoreEvents;
import org.omegat.core.data.IProject;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.events.IProjectEventListener;
import org.omegat.gui.comments.ICommentProvider;
import org.omegat.util.Language;
import org.omegat.util.Log;
import org.omegat.util.StringUtil;
import org.omegat.util.Token;

/**
 * Base class for Lucene-based tokenizers.
 *
 * @author Alex Buloichik (alex73mail@gmail.com)
 * @author Aaron Madlon-Kay
 */
public abstract class BaseTokenizer implements ITokenizer {

    private static final Map<String, Token[]> tokenCacheNone = new ConcurrentHashMap<String, Token[]>(5000);
    private static final Map<String, Token[]> tokenCacheMatching = new ConcurrentHashMap<String, Token[]>(5000);
    private static final Map<String, Token[]> tokenCacheGlossary = new ConcurrentHashMap<String, Token[]>(5000);

    protected static final String[] EMPTY_STRING_LIST = new String[0];
    protected static final Token[] EMPTY_TOKENS_LIST = new Token[0];
    protected static final int DEFAULT_TOKENS_COUNT = 64;

    /**
     * Indicates that {@link #tokenizeVerbatim(String)} should use OmegaT's
     * {@link WordIterator} to tokenize "exactly" for display.
     * <p>
     * For language-specific tokenizers that maintain the property that
     * <code>(the concatenation of all tokens).equals(original string) == true</code>,
     * set this to false to use the language-specific tokenizer for everything.
     */
    protected boolean shouldDelegateTokenizeExactly = true;

    public BaseTokenizer() {
        CoreEvents.registerProjectChangeListener(new IProjectEventListener() {
            @Override
            public void onProjectChanged(PROJECT_CHANGE_TYPE eventType) {
                if (eventType == PROJECT_CHANGE_TYPE.CLOSE) {
                    tokenCacheNone.clear();
                    tokenCacheMatching.clear();
                    tokenCacheGlossary.clear();
                }
            }
        });
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Token[] tokenizeWords(final String strOrig, final StemmingMode stemmingMode) {
        Map<String, Token[]> cache;
        switch (stemmingMode) {
        case NONE:
            cache = tokenCacheNone;
            break;
        case GLOSSARY:
            cache = tokenCacheGlossary;
            break;
        case MATCHING:
            cache = tokenCacheMatching;
            break;
        default:
            throw new RuntimeException("No cache for specified stemming mode");
        }
        Token[] result = cache.get(strOrig);
        if (result != null) {
            return result;
        }
        result = tokenize(strOrig,
                stemmingMode == StemmingMode.GLOSSARY || stemmingMode == StemmingMode.MATCHING,
                stemmingMode == StemmingMode.MATCHING,
                stemmingMode != StemmingMode.GLOSSARY,
                true);

        // put result in the cache
        cache.put(strOrig, result);
        return result;
    }

    @Override
    public String[] tokenizeWordsToStrings(String str, StemmingMode stemmingMode) {
        return tokenizeToStrings(str,
                stemmingMode == StemmingMode.GLOSSARY || stemmingMode == StemmingMode.MATCHING,
                stemmingMode == StemmingMode.MATCHING,
                stemmingMode != StemmingMode.GLOSSARY,
                true);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Token[] tokenizeVerbatim(final String strOrig) {
        if (StringUtil.isEmpty(strOrig)) {
            return EMPTY_TOKENS_LIST;
        }

        if (!shouldDelegateTokenizeExactly) {
            return tokenize(strOrig, false, false, false, false);
        }

        List<Token> result = new ArrayList<Token>(DEFAULT_TOKENS_COUNT);

        WordIterator iterator = new WordIterator();
        iterator.setText(strOrig);

        int start = iterator.first();
        for (int end = iterator.next(); end != BreakIterator.DONE; start = end,
                end = iterator.next()) {
            String tokenStr = strOrig.substring(start, end);
            result.add(new Token(tokenStr, start));
        }

        return result.toArray(new Token[result.size()]);
    }

    @Override
    public String[] tokenizeVerbatimToStrings(String str) {
        if (StringUtil.isEmpty(str)) {
            return EMPTY_STRING_LIST;
        }

        if (!shouldDelegateTokenizeExactly) {
            return tokenizeToStrings(str, false, false, false, false);
        }

        List<String> result = new ArrayList<String>(DEFAULT_TOKENS_COUNT);

        WordIterator iterator = new WordIterator();
        iterator.setText(str);

        int start = iterator.first();
        for (int end = iterator.next(); end != BreakIterator.DONE; start = end,
                end = iterator.next()) {
            String tokenStr = str.substring(start, end);
            result.add(tokenStr);
        }

        return result.toArray(new String[result.size()]);
    }

    protected Token[] tokenizeByCodePoint(String strOrig) {
        // See http://www.ibm.com/developerworks/library/j-unicode/#1-5
        // Example 1-5 appears to be faster than 1-6 for us (because our strings are short?)
        Token[] tokens = new Token[strOrig.codePointCount(0, strOrig.length())];
        for (int cp, i = 0, j = 0; i < strOrig.length(); i += Character.charCount(cp)) {
            cp = strOrig.codePointAt(i);
            tokens[j++] = new Token(String.valueOf(Character.toChars(cp)), i);
        }
        return tokens;
    }

    protected String[] tokenizeByCodePointToStrings(String strOrig) {
        // See http://www.ibm.com/developerworks/library/j-unicode/#1-5
        // Example 1-5 appears to be faster than 1-6 for us (because our strings are short?)
        String[] tokens = new String[strOrig.codePointCount(0, strOrig.length())];
        for (int cp, i = 0, j = 0; i < strOrig.length(); i += Character.charCount(cp)) {
            cp = strOrig.codePointAt(i);
            tokens[j++] = String.valueOf(Character.toChars(cp));
        }
        return tokens;
    }

    protected Token[] tokenize(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed,
            final boolean filterDigits, final boolean filterWhitespace) {
        if (StringUtil.isEmpty(strOrig)) {
            return EMPTY_TOKENS_LIST;
        }

        List<Token> result = new ArrayList<Token>(64);

        TokenStream in = null;
        try {
            in = getTokenStream(strOrig, stemsAllowed, stopWordsAllowed);
            in.addAttribute(CharTermAttribute.class);
            in.addAttribute(OffsetAttribute.class);

            CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
            OffsetAttribute off = in.getAttribute(OffsetAttribute.class);

            in.reset();
            while (in.incrementToken()) {
                String tokenText = cattr.toString();
                if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                    result.add(new Token(tokenText, off.startOffset(), off.endOffset() - off.startOffset()));
                }
            }
            in.end();
            in.close();
        } catch (IOException ex) {
            Log.log(ex);
        } finally {
            IOUtils.closeQuietly(in);
        }
        return result.toArray(new Token[result.size()]);
    }

    protected String[] tokenizeToStrings(String str, boolean stemsAllowed, boolean stopWordsAllowed,
            boolean filterDigits, boolean filterWhitespace) {
        if (StringUtil.isEmpty(str)) {
            return EMPTY_STRING_LIST;
        }

        List<String> result = new ArrayList<String>(64);

        TokenStream in = null;
        try {
            in = getTokenStream(str, stemsAllowed, stopWordsAllowed);
            in.addAttribute(CharTermAttribute.class);
            in.addAttribute(OffsetAttribute.class);

            CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
            OffsetAttribute off = in.getAttribute(OffsetAttribute.class);

            Locale loc = stemsAllowed ? getLanguage().getLocale() : null;

            in.reset();
            while (in.incrementToken()) {
                String tokenText = cattr.toString();
                if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                    result.add(tokenText);
                    if (stemsAllowed) {
                        String origText = str.substring(off.startOffset(), off.endOffset());
                        if (!origText.toLowerCase(loc).equals(tokenText.toLowerCase(loc))) {
                            result.add(origText);
                        }
                    }
                }
            }
            in.end();
            in.close();
        } catch (IOException ex) {
            Log.log(ex);
        } finally {
            IOUtils.closeQuietly(in);
        }
        return result.toArray(new String[result.size()]);
    }

    private boolean acceptToken(String token, boolean filterDigits, boolean filterWhitespace) {
        if (StringUtil.isEmpty(token)) {
            return false;
        }
        if (!filterDigits && !filterWhitespace) {
            return true;
        }
        boolean isWhitespaceOnly = true;
        for (int i = 0, cp; i < token.length(); i += Character.charCount(cp)) {
            cp = token.codePointAt(i);
            if (filterDigits && Character.isDigit(cp)) {
                return false;
            }
            if (filterWhitespace && !StringUtil.isWhiteSpace(cp)) {
                isWhitespaceOnly = false;
            }
        }
        return !(filterWhitespace && isWhitespaceOnly);
    }

    protected abstract TokenStream getTokenStream(String strOrig, boolean stemsAllowed, boolean stopWordsAllowed)
            throws IOException;

    /**
     * Minimal implementation that returns the default implementation
     * corresponding to all false parameters. Subclasses should override this to
     * handle true parameters.
     */
    protected TokenStream getStandardTokenStream(String strOrig) throws IOException {
        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader(strOrig));
        return tokenizer;
    }

    @Override
    public String[] getSupportedLanguages() {
        Tokenizer ann = getClass().getAnnotation(Tokenizer.class);
        if (ann == null) {
            throw new RuntimeException(getClass().getName() + " must have a "
                    + Tokenizer.class.getName() + " annotation available at runtime.");
        }
        return ann.languages();
    }

    protected Language getLanguage() {
        String[] languages = getSupportedLanguages();
        if (languages.length == 0 || languages[0] == Tokenizer.DISCOVER_AT_RUNTIME) {
            IProject proj = Core.getProject();
            if (proj == null) {
                throw new RuntimeException("This tokenizer's language can only be "
                        + "determined in the context of a project, but project is null.");
            } else if (proj.getSourceTokenizer() == this) {
                return proj.getProjectProperties().getSourceLanguage();
            } else if (proj.getTargetTokenizer() == this) {
                return proj.getProjectProperties().getTargetLanguage();
            } else {
                throw new RuntimeException("This tokenizer's language can only be "
                        + "determined in the context of a project, but is not assigned "
                        + "to current project.");
            }
        }
        return new Language(languages[0]);
    }

    protected String test(String... args) {
        StringBuilder sb = new StringBuilder();
        sb.append(getClass().getName()).append('\n');
        for (String input : args) {
            sb.append("Input:\n");
            sb.append(input).append("\n");
            sb.append("tokenizeVerbatim:\n");
            sb.append(printTest(tokenizeVerbatimToStrings(input), input));
            sb.append("tokenize:\n");
            sb.append(printTest(tokenizeToStrings(input, false, false, false, true), input));
            sb.append("tokenize (stemsAllowed):\n");
            sb.append(printTest(tokenizeToStrings(input, true, false, false, true), input));
            sb.append("tokenize (stemsAllowed stopWordsAllowed):\n");
            sb.append(printTest(tokenizeToStrings(input, true, true, false, true), input));
            sb.append("tokenize (stemsAllowed stopWordsAllowed filterDigits) (=tokenizeWords(MATCHING)):\n");
            sb.append(printTest(tokenizeToStrings(input, true, true, true, true), input));
            sb.append("tokenize (stemsAllowed filterDigits) (=tokenizeWords(GLOSSARY)):\n");
            sb.append(printTest(tokenizeToStrings(input, true, false, true, true), input));
            sb.append("tokenize (filterDigits) (=tokenizeWords(NONE)):\n");
            sb.append(printTest(tokenizeToStrings(input, false, false, true, true), input));
            sb.append("----------------------------------\n");
        }
        return sb.toString();
    }

    protected String printTest(String[] strings, String input) {
        StringBuilder sb = new StringBuilder();
        sb.append(StringUtils.join(strings, ", ")).append('\n');
        sb.append("Is verbatim: ").append(StringUtils.join(strings, "").equals(input)).append('\n');
        return sb.toString();
    }

    public static ICommentProvider TOKENIZER_DEBUG_PROVIDER = new ICommentProvider() {
        @Override
        public String getComment(SourceTextEntry newEntry) {
            return ((BaseTokenizer) Core.getProject().getSourceTokenizer()).test(newEntry.getSrcText());
        }
    };
}