/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.core.matching;
import org.omegat.core.data.StringData;
import org.omegat.tokenizer.DefaultTokenizer;
import org.omegat.util.Token;
/**
* Utility class responsible for building the list of fuzzy matches between
* source text strings.
*
* @author Maxym Mykhalchuk
*/
public class FuzzyMatcher {

    /** Not instantiable: the class only exposes static helpers. */
    private FuzzyMatcher() {
    }

    /**
     * Builds the similarity data for color highlight in match window.
     * <p>
     * The returned array has one entry per element of {@code matchTokens}:
     * <ul>
     * <li>{@code StringData.UNIQ} - the token is not contained in
     * {@code sourceTokens};</li>
     * <li>{@code StringData.PAIR} - the token is contained in
     * {@code sourceTokens}, but its left or right neighbour in
     * {@code matchTokens} is not;</li>
     * <li>{@code 0} - the token and both of its neighbours are contained in
     * {@code sourceTokens}.</li>
     * </ul>
     * A missing neighbour (before the first token, after the last token) is
     * treated as if it were contained.
     *
     * @param sourceTokens
     *            tokens the match tokens are looked up in
     * @param matchTokens
     *            tokens to classify
     * @return one classification byte per match token, in the same order
     */
    public static byte[] buildSimilarityData(Token[] sourceTokens, Token[] matchTokens) {
        int len = matchTokens.length;

        // Look every match token up exactly once. Each token is needed both as
        // "current" and as a neighbour of the adjacent tokens, so caching the
        // answer avoids a second scan of sourceTokens per token.
        // NOTE(review): assumes DefaultTokenizer.isContains is a side-effect-free
        // membership test - confirm.
        boolean[] found = new boolean[len];
        for (int i = 0; i < len; i++) {
            found[i] = DefaultTokenizer.isContains(sourceTokens, matchTokens[i]);
        }

        // A freshly allocated byte[] is zero-filled, which is the
        // "token and both neighbours found" value.
        byte[] result = new byte[len];
        for (int i = 0; i < len; i++) {
            if (!found[i]) {
                result[i] = StringData.UNIQ;
                continue;
            }
            // Array edges count as "found" so that the first/last token is
            // not flagged merely for lacking a neighbour.
            boolean leftFound = (i == 0) || found[i - 1];
            boolean rightFound = (i + 1 == len) || found[i + 1];
            if (!leftFound || !rightFound) {
                result[i] = StringData.PAIR;
            }
        }
        return result;
    }

    /**
     * Calculates the similarity of two token arrays, in percent.
     * <p>
     * The value is {@code 100 * (max - distance) / max} using integer
     * division, where {@code max} is the length of the longer array and
     * {@code distance} is the value returned by
     * {@code distanceCalculator.compute(str, cand)}.
     *
     * @param distanceCalculator
     *            calculator whose {@code compute(str, cand)} result is used as
     *            the distance between the two token arrays
     * @param str
     *            original string tokens
     * @param cand
     *            candidate string tokens
     * @return similarity in percents; {@code 0} when both arrays are empty
     */
    public static int calcSimilarity(final ISimilarityCalculator distanceCalculator, final Token[] str,
            final Token[] cand) {
        if (str.length == 0 && cand.length == 0) {
            // empty token lists - can't calculate similarity
            return 0;
        }
        // At least one array is non-empty here, so the divisor is > 0.
        int maxLength = Math.max(str.length, cand.length);
        int distance = distanceCalculator.compute(str, cand);
        return (100 * (maxLength - distance)) / maxLength;
    }
}