/************************************************************************** OmegaT - Computer Assisted Translation (CAT) tool with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk Home page: http://www.omegat.org/ Support center: http://groups.yahoo.com/group/OmegaT/ This file is part of OmegaT. OmegaT is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OmegaT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. **************************************************************************/ package org.omegat.core.matching; import org.omegat.core.data.StringData; import org.omegat.tokenizer.DefaultTokenizer; import org.omegat.util.Token; /** * The class, responsible for building the list of fuzzy matches between the * source text strings. * * @author Maxym Mykhalchuk */ public class FuzzyMatcher { private FuzzyMatcher() { } /** * Builds the similarity data for color highlight in match window. */ public static byte[] buildSimilarityData(Token[] sourceTokens, Token[] matchTokens) { int len = matchTokens.length; byte[] result = new byte[len]; boolean leftfound = true; for (int i = 0; i < len; i++) { result[i] = 0; Token righttoken = null; if (i + 1 < len) { righttoken = matchTokens[i + 1]; } boolean rightfound = (i + 1 == len) || DefaultTokenizer.isContains(sourceTokens, righttoken); Token token = matchTokens[i]; boolean found = DefaultTokenizer.isContains(sourceTokens, token); if (found && (!leftfound || !rightfound)) { result[i] = StringData.PAIR; } else if (!found) { result[i] = StringData.UNIQ; } leftfound = found; } return result; } /** * Calculate similarity for tokens arrays(percent). * * @param str * original string tokens * @param cand * candidate string tokens * @return similarity in percents */ public static int calcSimilarity(final ISimilarityCalculator distanceCalculator, final Token[] str, final Token[] cand) { if (str.length == 0 && cand.length == 0) { // empty token lists - can't calculate similarity return 0; } int ld = distanceCalculator.compute(str, cand); int similarity = (100 * (Math.max(str.length, cand.length) - ld)) / Math.max(str.length, cand.length); return similarity; } }