HistoryPredictor.java example

Explorer
OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2016 Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.gui.editor.history;

import java.util.Collections;
import java.util.List;
import java.util.logging.Logger;
import java.util.stream.Collectors;

import org.omegat.core.Core;
import org.omegat.core.CoreEvents;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.data.TMXEntry;
import org.omegat.core.events.IEntryEventListener;
import org.omegat.core.events.IProjectEventListener.PROJECT_CHANGE_TYPE;
import org.omegat.gui.editor.autocompleter.AutoCompleterItem;
import org.omegat.gui.editor.autocompleter.AutoCompleterListView;
import org.omegat.tokenizer.ITokenizer.StemmingMode;
import org.omegat.util.OStrings;
import org.omegat.util.Preferences;

public class HistoryPredictor extends AutoCompleterListView {

    private static final Logger LOGGER = Logger.getLogger(HistoryPredictor.class.getName());

    WordPredictor predictor = new WordPredictor();
    private SourceTextEntry currentEntry;
    private boolean isCurrentEntryTranslated;

    public HistoryPredictor() {
        super(OStrings.getString("AC_HISTORY_PREDICTIONS_VIEW"));

        CoreEvents.registerProjectChangeListener(eventType -> {
            if (isEnabled() && eventType == PROJECT_CHANGE_TYPE.LOAD) {
                train();
            }
        });
        CoreEvents.registerEntryEventListener(new IEntryEventListener() {
            @Override
            public void onNewFile(String activeFileName) {
            }
            @Override
            public void onEntryActivated(SourceTextEntry newEntry) {
                if (!isEnabled()) {
                    return;
                }
                SourceTextEntry lastEntry = currentEntry;
                boolean wasTranslated = isCurrentEntryTranslated;
                if (lastEntry != null && !wasTranslated) {
                    TMXEntry newTranslation = Core.getProject().getTranslationInfo(lastEntry);
                    if (newTranslation.isTranslated()) {
                        trainString(newTranslation.translation);
                    }
                }
                currentEntry = newEntry;
                isCurrentEntryTranslated = Core.getProject().getTranslationInfo(newEntry).isTranslated();
            }
        });
        Preferences.addPropertyChangeListener(Preferences.AC_HISTORY_PREDICTION_ENABLED, evt -> {
            if ((Boolean) evt.getNewValue()) {
                if (Core.getProject().isProjectLoaded()) {
                    train();
                }
            } else {
                predictor.reset();
            }
        });
    }

    synchronized void train() {
        long start = System.currentTimeMillis();
        predictor.reset();
        Core.getProject().iterateByDefaultTranslations((source, trans) -> trainString(trans.translation));
        Core.getProject().iterateByMultipleTranslations((source, trans) -> trainString(trans.translation));
        long time = System.currentTimeMillis() - start;
        LOGGER.finer(() -> String.format("Time to train History Predictor: %d ms", time));
    }

    private void trainString(String text) {
        if (text == null) {
            return;
        }
        String[] tokens = getTokenizer().tokenizeWordsToStrings(text, StemmingMode.NONE);

        predictor.train(tokens);
    }

    @Override
    public List<AutoCompleterItem> computeListData(String prevText, boolean contextualOnly) {
        if (prevText == null || prevText.isEmpty()) {
            return Collections.emptyList();
        }

        String[] tokens = getTokenizer().tokenizeVerbatimToStrings(prevText);

        String seed = lastFullWordToken(tokens);
        if (seed.isEmpty()) {
            return Collections.emptyList();
        }

        List<AutoCompleterItem> predictions = predictor.predictWord(seed).stream().map(p -> {
            return new AutoCompleterItem(p.getWord(),
                    new String[] { String.valueOf(Math.round(p.getFrequency())) + "%" }, 0);
        }).collect(Collectors.toList());

        if (predictions.isEmpty()) {
            return predictions;
        }

        // We have a non-space-delimited language, so it's not possible to
        // distinguish between a new-word situation and a completion situation.
        if (!isLanguageSpaceDelimited()) {
            return predictions;
        }

        // We are starting a new word so all predictions are relevant
        if (tokens[tokens.length - 1].trim().isEmpty()) {
            return predictions;
        }

        // We have context to filter on
        String context = tokens[tokens.length - 1];
        return predictions.stream().filter(item -> item.payload.startsWith(context) && !item.payload.equals(context))
                .map(item -> new AutoCompleterItem(item.payload, item.extras, context.length()))
                .collect(Collectors.toList());
    }

    /**
     * Find the last <em>completed</em> word.
     * <p>
     * If the language is space-delimited, that means ignoring the last token
     * (which should be a partially input word) and then iterating backwards to
     * find the first non-whitespace token.
     * <p>
     * If the language is not space-delimited, use the last token, as we have no
     * way of distinguishing a completed word from an incomplete one.
     *
     * @param tokens
     * @return
     */
    private String lastFullWordToken(String[] tokens) {
        int startOffset = isLanguageSpaceDelimited() ? 2 : 1;
        for (int i = tokens.length - startOffset; i >= 0; i--) {
            String token = tokens[i];
            if (!token.trim().isEmpty()) {
                return token;
            }
        }
        return "";
    }

    @Override
    public String itemToString(AutoCompleterItem item) {
        return "<html>" + item.payload + " <font color=\"gray\">(" + item.extras[0] + ")</font></html>";
    }

    @Override
    protected boolean isEnabled() {
        return Preferences.isPreference(Preferences.AC_HISTORY_PREDICTION_ENABLED);
    }

    private boolean isLanguageSpaceDelimited() {
        return getTargetLanguage().isSpaceDelimited();
    }
}