/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
2008 Alex Buloichik
2012 Thomas Cordonnier, Martin Fleurke
2013 Aaron Madlon-Kay, Alex Buloichik
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.core.statistics;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.omegat.core.Core;
import org.omegat.core.data.EntryKey;
import org.omegat.core.data.ExternalTMX;
import org.omegat.core.data.IProject;
import org.omegat.core.data.IProject.DefaultTranslationsIterator;
import org.omegat.core.data.IProject.MultipleTranslationsIterator;
import org.omegat.core.data.PrepareTMXEntry;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.data.TMXEntry;
import org.omegat.core.events.IStopped;
import org.omegat.core.matching.FuzzyMatcher;
import org.omegat.core.matching.ISimilarityCalculator;
import org.omegat.core.matching.LevenshteinDistance;
import org.omegat.core.matching.NearString;
import org.omegat.core.segmentation.Rule;
import org.omegat.tokenizer.ITokenizer;
import org.omegat.util.Language;
import org.omegat.util.OConsts;
import org.omegat.util.OStrings;
import org.omegat.util.PatternConsts;
import org.omegat.util.TMXProp;
import org.omegat.util.Token;
/**
* Class to find matches by specified criteria.
*
* Since we can use stemmers to prepare tokens, we should use 3-pass comparison of similarity. Similarity will
* be calculated in 3 steps:
*
* 1. Split original segment into word-only tokens using stemmer (with stop words list), then compare tokens.
*
* 2. Split original segment into word-only tokens without stemmer, then compare tokens.
*
* 3. Split original segment into not-only-words tokens (including numbers and tags) without stemmer, then
* compare tokens.
*
* This class is not thread-safe! An instance must be used from a single thread only.
*
* @author Maxym Mykhalchuk
* @author Alex Buloichik (alex73mail@gmail.com)
* @author Martin Fleurke
* @author Aaron Madlon-Kay
*/
public class FindMatches {
/**
 * Penalty subtracted from every score of a candidate whose translation is flagged as fuzzy.
 *
 * According to gettext source code, PO fuzzies are created above 60%
 * https://sourceforge.net/p/omegat/feature-requests/1258/
 */
static final int PENALTY_FOR_FUZZY = 40;
/**
 * Penalty subtracted from every score of a candidate when the text stripped from it by the remove
 * pattern differs from the text stripped from the searched string.
 */
private static final int PENALTY_FOR_REMOVED = 5;
/** Minimum stemmed score a sub-segment match needs before it is used to build a glued paragraph match. */
private static final int SUBSEGMENT_MATCH_THRESHOLD = 85;
/**
 * Looked up in the key of each external translation memory; the captured number is subtracted from the
 * scores of every entry of that memory.
 */
private static final Pattern SEARCH_FOR_PENALTY = Pattern.compile("penalty-(\\d+)");
/** Localized label reported as the file name of matches that come from orphaned project entries. */
private static final String ORPHANED_FILE_NAME = OStrings.getString("CT_ORPHAN_STRINGS");
/** Distance calculator handed to {@link FuzzyMatcher#calcSimilarity} for all three scores. */
private final ISimilarityCalculator distance = new LevenshteinDistance();
/**
 * The removePattern that was configured by the user. It is null-checked before every use, so it may be
 * null.
 */
private final Pattern removePattern = PatternConsts.getRemovePattern();
/** Project whose memories and entries are searched. */
private final IProject project;
/** Source tokenizer of the project, used for all three tokenizations. */
private final ITokenizer tok;
/** Locale of the project's source language; used to lower-case text before tokenizing. */
private final Locale srcLocale;
/** Maximum number of matches kept in the result list. */
private final int maxCount;
/** Result list. Re-created at the start of every search and kept ordered from best to worst match. */
private List<NearString> result;
/** When false, project entries whose source equals the searched text are skipped. */
private final boolean searchExactlyTheSame;
/** Text of the current search after the remove pattern (if any) has been applied. */
private String srcText;
/**
 * Text that was removed by the removePattern from the source text. Empty when nothing was removed.
 */
private String removedText;
/** Tokens for original string, with and without stems. */
private Token[] strTokensStem, strTokensNoStem;
/** Tokens for original string, includes numbers and tags. */
private Token[] strTokensAll;
// Finder used to search matches for the separate segments of a paragraph. It is only created when
// separate-segment matching was requested and sentence segmenting is disabled for the project;
// otherwise it stays null.
private FindMatches separateSegmentMatcher;
/**
 * Creates a finder bound to one project.
 *
 * @param project
 *            project whose memories and entries are searched; its source tokenizer and source language
 *            locale are captured here
 * @param maxCount
 *            maximum number of matches kept in the result list
 * @param allowSeparateSegmentMatch
 *            when true and sentence segmenting is disabled for the project, a nested finder is created so
 *            that matches can also be looked up for the separate segments of a paragraph
 * @param searchExactlyTheSame
 *            allows to search similarities with the same text as source segment. This mode used only for
 *            separate sentence match in paragraph project, i.e. where source is just part of current
 *            source.
 */
public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentMatch,
        boolean searchExactlyTheSame) {
    this.project = project;
    this.maxCount = maxCount;
    this.searchExactlyTheSame = searchExactlyTheSame;
    tok = project.getSourceTokenizer();
    srcLocale = project.getProjectProperties().getSourceLanguage().getLocale();
    if (allowSeparateSegmentMatch) {
        if (!project.getProjectProperties().isSentenceSegmentingEnabled()) {
            // The nested finder keeps only the single best match and may return the segment itself.
            separateSegmentMatcher = new FindMatches(project, 1, false, true);
        }
    }
}
/**
 * Finds the best matches for the given text in the project memory, the external translation memories,
 * the source-file translations and, when a separate-segment finder exists, in a match glued together
 * from the matches of the individual segments of the text.
 *
 * @param searchText
 *            text to find matches for
 * @param requiresTranslation
 *            when true, candidates whose translation is null are skipped
 * @param fillSimilarityData
 *            when true, the {@code attr} of every returned match is filled with the similarity data built
 *            by {@link FuzzyMatcher#buildSimilarityData}
 * @param stop
 *            polled before every candidate is examined
 * @return the result list of this finder, best match first, holding at most {@code maxCount} entries; the
 *         list is replaced by the next call
 * @throws StoppedException
 *             when {@code stop} reports that processing must stop
 */
public List<NearString> search(final String searchText, final boolean requiresTranslation,
        final boolean fillSimilarityData, final IStopped stop) throws StoppedException {
    result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);
    srcText = searchText;
    removedText = "";
    // remove part that is to be removed according to user settings.
    // Rationale: it might be a big string influencing the 'editing distance', while it is not really part
    // of the translatable text
    if (removePattern != null) {
        StringBuilder removedBuffer = new StringBuilder();
        Matcher removeMatcher = removePattern.matcher(srcText);
        while (removeMatcher.find()) {
            removedBuffer.append(removeMatcher.group());
        }
        srcText = removeMatcher.replaceAll("");
        removedText = removedBuffer.toString();
    }
    // get tokens for original string
    strTokensStem = tokenizeStem(srcText);
    strTokensNoStem = tokenizeNoStem(srcText);
    // includes non-word tokens
    strTokensAll = tokenizeAll(srcText);

    searchProjectMemory(searchText, requiresTranslation, stop);
    searchExternalMemories(requiresTranslation, stop);
    searchSourceTranslations(stop);
    if (separateSegmentMatcher != null) {
        searchSeparateSegments(requiresTranslation, stop);
    }
    if (fillSimilarityData) {
        // fill similarity data only for result
        for (NearString near : result) {
            // fix for bug 1586397
            near.attr = FuzzyMatcher.buildSimilarityData(strTokensAll, tokenizeAll(near.source));
        }
    }
    return result;
}

/** Scores the default and alternative translations of the project, including orphaned ones. */
private void searchProjectMemory(final String searchText, final boolean requiresTranslation,
        final IStopped stop) {
    if (project.getProjectProperties().isSupportDefaultTranslations()) {
        project.iterateByDefaultTranslations(new DefaultTranslationsIterator() {
            @Override
            public void iterate(String source, TMXEntry trans) {
                checkStopped(stop);
                if (!searchExactlyTheSame && source.equals(searchText)) {
                    // skip original==original entry comparison
                    return;
                }
                if (requiresTranslation && trans.translation == null) {
                    return;
                }
                String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
                processEntry(null, source, trans.translation, NearString.MATCH_SOURCE.MEMORY, false, 0,
                        fileName, trans.creator, trans.creationDate, trans.changer, trans.changeDate,
                        null);
            }
        });
    }
    project.iterateByMultipleTranslations(new MultipleTranslationsIterator() {
        @Override
        public void iterate(EntryKey source, TMXEntry trans) {
            checkStopped(stop);
            if (!searchExactlyTheSame && source.sourceText.equals(searchText)) {
                // skip original==original entry comparison
                return;
            }
            if (requiresTranslation && trans.translation == null) {
                return;
            }
            String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
            processEntry(source, source.sourceText, trans.translation, NearString.MATCH_SOURCE.MEMORY,
                    false, 0, fileName, trans.creator, trans.creationDate, trans.changer,
                    trans.changeDate, null);
        }
    });
}

/** Scores every entry of every external translation memory, applying the penalty found in its key. */
private void searchExternalMemories(final boolean requiresTranslation, final IStopped stop) {
    for (Map.Entry<String, ExternalTMX> en : project.getTransMemories().entrySet()) {
        int penalty = 0;
        Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey());
        if (matcher.find()) {
            penalty = Integer.parseInt(matcher.group(1));
        }
        for (PrepareTMXEntry tmen : en.getValue().getEntries()) {
            checkStopped(stop);
            if (tmen.source == null) {
                // Defensive: an entry without source text cannot produce a meaningful match, and
                // processEntry would fail with a NullPointerException on it.
                continue;
            }
            if (requiresTranslation && tmen.translation == null) {
                continue;
            }
            processEntry(null, tmen.source, tmen.translation, NearString.MATCH_SOURCE.TM, false, penalty,
                    en.getKey(), tmen.creator, tmen.creationDate, tmen.changer, tmen.changeDate,
                    tmen.otherProperties);
        }
    }
}

/** Scores the translations that project entries carry from their source files. */
private void searchSourceTranslations(final IStopped stop) {
    for (SourceTextEntry ste : project.getAllEntries()) {
        checkStopped(stop);
        if (ste.getSourceTranslation() != null) {
            processEntry(ste.getKey(), ste.getSrcText(), ste.getSourceTranslation(),
                    NearString.MATCH_SOURCE.MEMORY, ste.isSourceTranslationFuzzy(), 0, ste.getKey().file,
                    "", 0, "", 0, null);
        }
    }
}

/**
 * Splits the searched text into segments, looks up the best match of each segment and scores the
 * paragraph obtained by gluing those matches back together.
 */
private void searchSeparateSegments(final boolean requiresTranslation, final IStopped stop) {
    // split paragraph even when segmentation disabled, then find matches for every segment
    List<StringBuilder> spaces = new ArrayList<>();
    List<Rule> brules = new ArrayList<>();
    Language sourceLang = project.getProjectProperties().getSourceLanguage();
    Language targetLang = project.getProjectProperties().getTargetLanguage();
    List<String> segments = Core.getSegmenter().segment(sourceLang, srcText, spaces, brules);
    if (segments.size() <= 1) {
        return;
    }
    List<String> fsrc = new ArrayList<>(segments.size());
    List<String> ftrans = new ArrayList<>(segments.size());
    // Iterating the list directly avoids the overflow of a 'short' index on very long paragraphs.
    for (String onesrc : segments) {
        // find match for separate segment
        List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation, false,
                stop);
        if (!segmentMatch.isEmpty()
                && segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) {
            NearString best = segmentMatch.get(0);
            fsrc.add(best.source);
            ftrans.add(best.translation);
        } else {
            // keep the lists aligned with the segments so that gluing restores the spacing
            fsrc.add("");
            ftrans.add("");
        }
    }
    // glue found sources
    String foundSrc = Core.getSegmenter().glue(sourceLang, sourceLang, fsrc, spaces, brules);
    // glue found translations
    String foundTrans = Core.getSegmenter().glue(sourceLang, targetLang, ftrans, spaces, brules);
    processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.TM, false, 0, "", "", 0, "",
            0, null);
}
/**
 * Scores one candidate against the text of the current search and, if it can still enter the result
 * list, passes it on to {@link #addNearString}.
 *
 * Three scores are computed in turn (stemmed tokens, unstemmed tokens, all tokens). Each one is reduced
 * by {@code penalty}, by the fuzzy penalty when {@code fuzzy} is set, and by the "removed text" penalty
 * when the text stripped from the candidate by the remove pattern differs from the text stripped from
 * the searched string. After each score the candidate is dropped as soon as
 * {@link #haveChanceToAdd} says it cannot be added, so later tokenizations are skipped.
 *
 * @param key
 *            project entry key of the candidate, or null
 * @param source
 *            source text of the candidate
 * @param translation
 *            translation of the candidate
 * @param comesFrom
 *            origin of the candidate
 * @param fuzzy
 *            whether the candidate's translation is flagged as fuzzy
 * @param penalty
 *            penalty subtracted from every score
 * @param tmxName
 *            name of the memory or file the candidate comes from
 * @param creator
 *            creator recorded for the candidate
 * @param creationDate
 *            creation date recorded for the candidate
 * @param changer
 *            last changer recorded for the candidate
 * @param changedDate
 *            last change date recorded for the candidate
 * @param props
 *            additional properties of the candidate, or null
 */
protected void processEntry(final EntryKey key, final String source, final String translation,
        NearString.MATCH_SOURCE comesFrom, final boolean fuzzy, final int penalty, final String tmxName,
        final String creator, final long creationDate, final String changer, final long changedDate,
        final List<TMXProp> props) {
    // Strip the user-configured removable part before tokenizing.
    String comparable = source;
    int removedPenalty = 0;
    if (removePattern != null) {
        final Matcher stripper = removePattern.matcher(comparable);
        final StringBuilder stripped = new StringBuilder();
        while (stripper.find()) {
            stripped.append(stripper.group());
        }
        comparable = stripper.replaceAll("");
        // Without this penalty, strings that differ only in the removed part would get a 100% match.
        removedPenalty = stripped.toString().equals(removedText) ? 0 : PENALTY_FOR_REMOVED;
    }
    // The same deduction applies to all three scores.
    final int deduction = penalty + (fuzzy ? PENALTY_FOR_FUZZY : 0) + removedPenalty;

    // First percent value - with stemming if possible
    final int scoreStem = FuzzyMatcher.calcSimilarity(distance, strTokensStem, tokenizeStem(comparable))
            - deduction;
    if (!haveChanceToAdd(scoreStem, Integer.MAX_VALUE, Integer.MAX_VALUE)) {
        return;
    }

    // Second percent value - without stemming
    final int scoreNoStem = FuzzyMatcher.calcSimilarity(distance, strTokensNoStem,
            tokenizeNoStem(comparable)) - deduction;
    if (!haveChanceToAdd(scoreStem, scoreNoStem, Integer.MAX_VALUE)) {
        return;
    }

    // Third percent value - with numbers, tags, etc.
    final int scoreAll = FuzzyMatcher.calcSimilarity(distance, strTokensAll, tokenizeAll(comparable))
            - deduction;
    if (!haveChanceToAdd(scoreStem, scoreNoStem, scoreAll)) {
        return;
    }

    // The unstripped source is what gets stored in the result.
    addNearString(key, source, translation, comesFrom, fuzzy, scoreStem, scoreNoStem, scoreAll, null,
            tmxName, creator, creationDate, changer, changedDate, props);
}
/**
 * Check if entry have a chance to be added to result list. If no, there is no sense to calculate other
 * parameters.
 *
 * An entry has no chance when both its stemmed and unstemmed scores are below
 * {@code OConsts.FUZZY_MATCH_THRESHOLD}. Otherwise it has a chance while the result list is not full, or
 * when it does not rank below the last (worst) entry of the full list; the ranking compares the stemmed
 * score, then the unstemmed score, then the adjusted score.
 *
 * @param simStem
 *            similarity with stemming
 * @param simNoStem
 *            similarity without stemming
 * @param simExactly
 *            exactly similarity
 * @return true if we have chance
 */
protected boolean haveChanceToAdd(final int simStem, final int simNoStem, final int simExactly) {
    if (simStem < OConsts.FUZZY_MATCH_THRESHOLD && simNoStem < OConsts.FUZZY_MATCH_THRESHOLD) {
        return false;
    }
    if (result.size() < maxCount) {
        return true;
    }
    if (result.isEmpty()) {
        // Only reachable when maxCount <= 0: no entry can be kept, and there is no worst entry to
        // compare with.
        return false;
    }
    NearString worst = result.get(result.size() - 1);
    int chance = Integer.compare(worst.scores[0].score, simStem);
    if (chance == 0) {
        chance = Integer.compare(worst.scores[0].scoreNoStem, simNoStem);
    }
    if (chance == 0) {
        chance = Integer.compare(worst.scores[0].adjustedScore, simExactly);
    }
    // Integer.compare only guarantees the sign of its result, so test the sign rather than the value 1.
    return chance <= 0;
}
/**
 * Add near string into result list, which is kept sorted from best to worst by stemmed score, then
 * unstemmed score, then adjusted score.
 *
 * If an entry with the same source and translation is met while looking for the insertion point, the new
 * data is merged into that entry instead of being inserted. After an insertion the list is cut back to
 * {@code maxCount} entries by dropping the last one.
 */
protected void addNearString(final EntryKey key, final String source, final String translation,
        NearString.MATCH_SOURCE comesFrom, final boolean fuzzy, final int similarity,
        final int similarityNoStem, final int simAdjusted, final byte[] similarityData,
        final String tmxName, final String creator, final long creationDate, final String changer,
        final long changedDate, final List<TMXProp> tuProperties) {
    // Append at the end unless an existing entry turns out to rank below the new one.
    int insertAt = result.size();
    for (int idx = 0; idx < result.size(); idx++) {
        final NearString existing = result.get(idx);
        final boolean sameMatch = source.equals(existing.source)
                && Objects.equals(translation, existing.translation);
        if (sameMatch) {
            // Consolidate identical matches from different sources into a single NearString with
            // multiple project entries.
            result.set(idx, NearString.merge(existing, key, source, translation, comesFrom, fuzzy,
                    similarity, similarityNoStem, simAdjusted, similarityData, tmxName, creator,
                    creationDate, changer, changedDate, tuProperties));
            return;
        }
        if (goesBefore(existing, source, similarity, similarityNoStem, simAdjusted)) {
            insertAt = idx;
            break;
        }
    }
    result.add(insertAt, new NearString(key, source, translation, comesFrom, fuzzy, similarity,
            similarityNoStem, simAdjusted, similarityData, tmxName, creator, creationDate, changer,
            changedDate, tuProperties));
    if (result.size() > maxCount) {
        result.remove(result.size() - 1);
    }
}

/** Tells whether a new match with the given source and scores must be placed before {@code existing}. */
private boolean goesBefore(final NearString existing, final String source, final int similarity,
        final int similarityNoStem, final int simAdjusted) {
    if (existing.scores[0].score != similarity) {
        return existing.scores[0].score < similarity;
    }
    if (existing.scores[0].scoreNoStem != similarityNoStem) {
        return existing.scores[0].scoreNoStem < similarityNoStem;
    }
    if (existing.scores[0].adjustedScore < simAdjusted) {
        return true;
    }
    // Patch contributed by Antonio Vilei
    // text with the same case has precedence
    return similarity == 100 && !existing.source.equals(srcText) && source.equals(srcText);
}
/*
 * Caches for the tokenizing methods below. Each map holds the tokens already computed for a string, so
 * that the same text is tokenized only once per finder. Nothing in this class removes entries from
 * these maps.
 */
// Keyed by the text exactly as passed to tokenizeStem.
Map<String, Token[]> tokenizeStemCache = new HashMap<String, Token[]>();
// Keyed by the lower-cased text (see tokenizeNoStem).
Map<String, Token[]> tokenizeNoStemCache = new HashMap<String, Token[]>();
// Keyed by the lower-cased text (see tokenizeAll).
Map<String, Token[]> tokenizeAllCache = new HashMap<String, Token[]>();
/**
 * Returns the word tokens of the text, produced with the {@code MATCHING} stemming mode. Tokens are
 * taken from the cache when the same text was tokenized before.
 *
 * @param str
 *            text to tokenize
 * @return tokens of the text
 */
public Token[] tokenizeStem(String str) {
    final Token[] cached = tokenizeStemCache.get(str);
    if (cached != null) {
        return cached;
    }
    final Token[] tokens = tok.tokenizeWords(str, ITokenizer.StemmingMode.MATCHING);
    tokenizeStemCache.put(str, tokens);
    return tokens;
}
/**
 * Returns the word tokens of the lower-cased text, produced without stemming. Tokens are taken from the
 * cache when the same lower-cased text was tokenized before.
 *
 * @param str
 *            text to tokenize
 * @return tokens of the lower-cased text
 */
public Token[] tokenizeNoStem(String str) {
    // No-stemming token comparisons are intentionally case-insensitive
    // for matching purposes, hence the lower-casing in the source locale.
    final String lowered = str.toLowerCase(srcLocale);
    final Token[] cached = tokenizeNoStemCache.get(lowered);
    if (cached != null) {
        return cached;
    }
    final Token[] tokens = tok.tokenizeWords(lowered, ITokenizer.StemmingMode.NONE);
    tokenizeNoStemCache.put(lowered, tokens);
    return tokens;
}
/**
 * Returns the verbatim tokens of the lower-cased text. Tokens are taken from the cache when the same
 * lower-cased text was tokenized before.
 *
 * @param str
 *            text to tokenize
 * @return verbatim tokens of the lower-cased text
 */
public Token[] tokenizeAll(String str) {
    // Verbatim token comparisons are intentionally case-insensitive
    // for matching purposes, hence the lower-casing in the source locale.
    final String lowered = str.toLowerCase(srcLocale);
    final Token[] cached = tokenizeAllCache.get(lowered);
    if (cached != null) {
        return cached;
    }
    final Token[] tokens = tok.tokenizeVerbatim(lowered);
    tokenizeAllCache.put(lowered, tokens);
    return tokens;
}
/**
 * Aborts the current search when the given flag reports that processing must stop.
 *
 * @param stop
 *            stop flag to poll
 * @throws StoppedException
 *             if {@code stop.isStopped()} returns true
 */
protected void checkStopped(IStopped stop) throws StoppedException {
    final boolean stopRequested = stop.isStopped();
    if (!stopRequested) {
        return;
    }
    throw new StoppedException();
}
/**
 * Thrown when a search is abandoned because the stop flag passed to it reported that processing must
 * stop. All callers must catch it and just skip the interrupted search.
 */
@SuppressWarnings("serial")
public static class StoppedException extends RuntimeException {
}
}