/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2016 Aaron Madlon-Kay
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.gui.editor.history;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.stream.Collectors;
/**
 * Predicts the word likely to follow a given word, based on counts of adjacent
 * token pairs collected through {@link #train(String[])}.
 * <p>
 * This class performs no synchronization of its own.
 */
public class WordPredictor {

    /** Minimum share, in percent, that a candidate must reach to be returned as a prediction. */
    static final double MIN_FREQUENCY = 10d;

    /** Orders predictions by descending frequency; ties are broken by the word's natural order. */
    private static final Comparator<Prediction> RESULT_SORTER = Comparator
            .comparingDouble(Prediction::getFrequency).reversed().thenComparing(Prediction::getWord);

    /** Maps each token seen during training to the counts of the tokens that directly followed it. */
    private final Map<String, FrequencyStrings> data = new HashMap<>();

    /**
     * Discards everything learned so far.
     */
    public void reset() {
        data.clear();
    }

    /**
     * Records, for every pair of adjacent tokens, that the second token followed the first.
     * Arrays with fewer than two tokens contribute nothing.
     *
     * @param tokens
     *            the token sequence to learn from
     * @throws NullPointerException
     *             if {@code tokens} is {@code null}
     */
    public void train(String[] tokens) {
        Objects.requireNonNull(tokens, "Training tokens can't be null");
        // Every token except the last acts as a key; the token right after it is its successor.
        for (int i = 0; i < tokens.length - 1; i++) {
            data.computeIfAbsent(tokens[i], key -> new FrequencyStrings()).encounter(tokens[i + 1]);
        }
    }

    /**
     * Returns the words that followed {@code seed} during training, most frequent first.
     * <p>
     * Successors seen only once are ignored. The frequency of each remaining successor is its
     * share, in percent, of the combined count of the remaining successors; successors below
     * {@link #MIN_FREQUENCY} are dropped.
     *
     * @param seed
     *            the word whose successors are wanted
     * @return the predictions sorted by descending frequency and then by word; empty if nothing
     *         has been learned, if {@code seed} is empty or if {@code seed} was never seen as a
     *         predecessor
     * @throws NullPointerException
     *             if {@code seed} is {@code null}
     */
    public List<Prediction> predictWord(String seed) {
        Objects.requireNonNull(seed, "Prediction seed can't be null");
        if (data.isEmpty() || seed.isEmpty()) {
            return Collections.emptyList();
        }
        FrequencyStrings candidates = data.get(seed);
        if (candidates == null) {
            return Collections.emptyList();
        }
        // Only consider candidates that have appeared more than once.
        List<Entry<String, Integer>> entries = candidates.getEntries().stream()
                .filter(e -> e.getValue() > 1)
                .collect(Collectors.toList());
        // The percentage base is the sum over the retained candidates only.
        int total = entries.stream().mapToInt(Entry::getValue).sum();
        return entries.stream()
                .map(e -> new Prediction(e.getKey(), ((double) e.getValue() / total) * 100))
                // Only retain predictions meeting the minimum frequency.
                .filter(prediction -> prediction.getFrequency() >= MIN_FREQUENCY)
                .sorted(RESULT_SORTER)
                .collect(Collectors.toList());
    }

    /**
     * Occurrence counter for strings.
     */
    private static class FrequencyStrings {
        private final Map<String, Integer> map = new HashMap<>();

        /**
         * Increments the count of the given string, starting from one on first sight.
         *
         * @param string
         *            the string that was encountered
         */
        public void encounter(String string) {
            map.merge(string, 1, Integer::sum);
        }

        /**
         * Returns a new list holding the current string-to-count entries.
         *
         * @return the entries, in no particular order
         */
        public List<Entry<String, Integer>> getEntries() {
            return new ArrayList<>(map.entrySet());
        }
    }

    /**
     * A predicted word together with its frequency in percent.
     */
    public static class Prediction {
        private final String word;
        private final double frequency;

        /**
         * @param word
         *            the predicted word
         * @param frequency
         *            the frequency of the word, in percent
         */
        public Prediction(String word, double frequency) {
            this.word = word;
            this.frequency = frequency;
        }

        /**
         * @return the predicted word
         */
        public String getWord() {
            return word;
        }

        /**
         * @return the frequency of the word, in percent
         */
        public double getFrequency() {
            return frequency;
        }
    }
}