FindMatches.java example

Explorer
OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
               2008 Alex Buloichik
               2012 Thomas Cordonnier, Martin Fleurke
               2013 Aaron Madlon-Kay, Alex Buloichik
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.core.statistics;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.omegat.core.Core;
import org.omegat.core.data.EntryKey;
import org.omegat.core.data.ExternalTMX;
import org.omegat.core.data.IProject;
import org.omegat.core.data.IProject.DefaultTranslationsIterator;
import org.omegat.core.data.IProject.MultipleTranslationsIterator;
import org.omegat.core.data.PrepareTMXEntry;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.data.TMXEntry;
import org.omegat.core.events.IStopped;
import org.omegat.core.matching.FuzzyMatcher;
import org.omegat.core.matching.ISimilarityCalculator;
import org.omegat.core.matching.LevenshteinDistance;
import org.omegat.core.matching.NearString;
import org.omegat.core.segmentation.Rule;
import org.omegat.tokenizer.ITokenizer;
import org.omegat.util.Language;
import org.omegat.util.OConsts;
import org.omegat.util.OStrings;
import org.omegat.util.PatternConsts;
import org.omegat.util.TMXProp;
import org.omegat.util.Token;

/**
 * Class to find matches by specified criteria.
 *
 * Since we can use stemmers to prepare tokens, we should use 3-pass comparison of similarity. Similarity will
 * be calculated in 3 steps:
 *
 * 1. Split original segment into word-only tokens using stemmer (with stop words list), then compare tokens.
 *
 * 2. Split original segment into word-only tokens without stemmer, then compare tokens.
 *
 * 3. Split original segment into not-only-words tokens (including numbers and tags) without stemmer, then
 * compare tokens.
 *
 * This class is not thread-safe! It must be used from a single thread only.
 *
 * @author Maxym Mykhalchuk
 * @author Alex Buloichik (alex73mail@gmail.com)
 * @author Martin Fleurke
 * @author Aaron Madlon-Kay
 */
public class FindMatches {

    /**
    * According to gettext source code, PO fuzzies are created above 60%
    * https://sourceforge.net/p/omegat/feature-requests/1258/
    */
    static final int PENALTY_FOR_FUZZY = 40;
    private static final int PENALTY_FOR_REMOVED = 5;
    private static final int SUBSEGMENT_MATCH_THRESHOLD = 85;

    private static final Pattern SEARCH_FOR_PENALTY = Pattern.compile("penalty-(\\d+)");

    private static final String ORPHANED_FILE_NAME = OStrings.getString("CT_ORPHAN_STRINGS");

    private final ISimilarityCalculator distance = new LevenshteinDistance();

    /**
     * the removePattern that was configured by the user.
     */
    private final Pattern removePattern = PatternConsts.getRemovePattern();

    private final IProject project;
    private final ITokenizer tok;
    private final Locale srcLocale;
    private final int maxCount;

    /** Result list. */
    private List<NearString> result;

    private final boolean searchExactlyTheSame;
    private String srcText;

    /**
     * Text that was removed by the removePattern from the source text.
     */
    private String removedText;

    /** Tokens for original string, with and without stems. */
    private Token[] strTokensStem, strTokensNoStem;

    /** Tokens for original string, includes numbers and tags. */
    private Token[] strTokensAll;

    // This finder used for search separate segment matches
    private FindMatches separateSegmentMatcher;

    /**
     * @param searchExactlyTheSame
     *            allows to search similarities with the same text as source segment. This mode used only for
     *            separate sentence match in paragraph project, i.e. where source is just part of current
     *            source.
     */
    public FindMatches(IProject project, int maxCount, boolean allowSeparateSegmentMatch,
            boolean searchExactlyTheSame) {
        this.project = project;
        this.tok = project.getSourceTokenizer();
        this.srcLocale = project.getProjectProperties().getSourceLanguage().getLocale();
        this.maxCount = maxCount;
        this.searchExactlyTheSame = searchExactlyTheSame;
        if (allowSeparateSegmentMatch && !project.getProjectProperties().isSentenceSegmentingEnabled()) {
            separateSegmentMatcher = new FindMatches(project, 1, false, true);
        }
    }

    public List<NearString> search(final String searchText, final boolean requiresTranslation,
            final boolean fillSimilarityData, final IStopped stop) throws StoppedException {
        result = new ArrayList<>(OConsts.MAX_NEAR_STRINGS + 1);

        srcText = searchText;
        removedText = "";

        // remove part that is to be removed according to user settings.
        // Rationale: it might be a big string influencing the 'editing distance', while it is not really part
        // of the translatable text
        if (removePattern != null) {
            StringBuilder removedBuffer = new StringBuilder();
            Matcher removeMatcher = removePattern.matcher(srcText);
            while (removeMatcher.find()) {
                removedBuffer.append(removeMatcher.group());
            }
            srcText = removeMatcher.replaceAll("");
            removedText = removedBuffer.toString();
        }

        // get tokens for original string
        strTokensStem = tokenizeStem(srcText);
        strTokensNoStem = tokenizeNoStem(srcText);
        strTokensAll = tokenizeAll(srcText);
        /* HP: includes non - word tokens */

        // travel by project entries, including orphaned
        if (project.getProjectProperties().isSupportDefaultTranslations()) {
            project.iterateByDefaultTranslations(new DefaultTranslationsIterator() {
                public void iterate(String source, TMXEntry trans) {
                    checkStopped(stop);
                    if (!searchExactlyTheSame && source.equals(searchText)) {
                        // skip original==original entry comparison
                        return;
                    }
                    if (requiresTranslation && trans.translation == null) {
                        return;
                    }
                    String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
                    processEntry(null, source, trans.translation, NearString.MATCH_SOURCE.MEMORY, false, 0,
                            fileName, trans.creator, trans.creationDate, trans.changer, trans.changeDate,
                            null);
                }
            });
        }
        project.iterateByMultipleTranslations(new MultipleTranslationsIterator() {
            public void iterate(EntryKey source, TMXEntry trans) {
                checkStopped(stop);
                if (!searchExactlyTheSame && source.sourceText.equals(searchText)) {
                    // skip original==original entry comparison
                    return;
                }
                if (requiresTranslation && trans.translation == null) {
                    return;
                }
                String fileName = project.isOrphaned(source) ? ORPHANED_FILE_NAME : null;
                processEntry(source, source.sourceText, trans.translation, NearString.MATCH_SOURCE.MEMORY,
                        false, 0, fileName, trans.creator, trans.creationDate, trans.changer,
                        trans.changeDate, null);
            }
        });

        // travel by translation memories
        for (Map.Entry<String, ExternalTMX> en : project.getTransMemories().entrySet()) {
            int penalty = 0;
            Matcher matcher = SEARCH_FOR_PENALTY.matcher(en.getKey());
            if (matcher.find()) {
                penalty = Integer.parseInt(matcher.group(1));
            }
            for (PrepareTMXEntry tmen : en.getValue().getEntries()) {
                checkStopped(stop);
                if (requiresTranslation && tmen.translation == null) {
                    continue;
                }
                processEntry(null, tmen.source, tmen.translation, NearString.MATCH_SOURCE.TM, false, penalty,
                        en.getKey(), tmen.creator, tmen.creationDate, tmen.changer, tmen.changeDate,
                        tmen.otherProperties);
            }
        }

        // travel by all entries for check source file translations
        for (SourceTextEntry ste : project.getAllEntries()) {
            checkStopped(stop);
            if (ste.getSourceTranslation() != null) {
                processEntry(ste.getKey(), ste.getSrcText(), ste.getSourceTranslation(),
                        NearString.MATCH_SOURCE.MEMORY, ste.isSourceTranslationFuzzy(), 0, ste.getKey().file,
                        "", 0, "", 0, null);
            }
        }

        if (separateSegmentMatcher != null) {
            // split paragraph even when segmentation disabled, then find matches for every segment
            List<StringBuilder> spaces = new ArrayList<StringBuilder>();
            List<Rule> brules = new ArrayList<Rule>();
            Language sourceLang = project.getProjectProperties().getSourceLanguage();
            Language targetLang = project.getProjectProperties().getTargetLanguage();
            List<String> segments = Core.getSegmenter().segment(sourceLang, srcText, spaces, brules);
            if (segments.size() > 1) {
                List<String> fsrc = new ArrayList<String>(segments.size());
                List<String> ftrans = new ArrayList<String>(segments.size());
                // multiple segments
                for (short i = 0; i < segments.size(); i++) {
                    String onesrc = segments.get(i);

                    // find match for separate segment
                    List<NearString> segmentMatch = separateSegmentMatcher.search(onesrc, requiresTranslation, false,
                            stop);
                    if (!segmentMatch.isEmpty()
                            && segmentMatch.get(0).scores[0].score >= SUBSEGMENT_MATCH_THRESHOLD) {
                        fsrc.add(segmentMatch.get(0).source);
                        ftrans.add(segmentMatch.get(0).translation);
                    } else {
                        fsrc.add("");
                        ftrans.add("");
                    }
                }
                // glue found sources
                String foundSrc = Core.getSegmenter().glue(sourceLang, sourceLang, fsrc, spaces, brules);
                // glue found translations
                String foundTrans = Core.getSegmenter().glue(sourceLang, targetLang, ftrans, spaces, brules);
                processEntry(null, foundSrc, foundTrans, NearString.MATCH_SOURCE.TM, false, 0, "", "", 0, "",
                        0, null);
            }
        }

        if (fillSimilarityData) {
            // fill similarity data only for result
            for (NearString near : result) {
                // fix for bug 1586397
                byte[] similarityData = FuzzyMatcher.buildSimilarityData(strTokensAll,
                        tokenizeAll(near.source));
                near.attr = similarityData;
            }
        }

        return result;
    }

    /**
     * Compare one entry with original entry.
     *
     * @param candEntry
     *            entry to compare
     */
    protected void processEntry(final EntryKey key, final String source, final String translation,
            NearString.MATCH_SOURCE comesFrom, final boolean fuzzy, final int penalty, final String tmxName,
            final String creator, final long creationDate, final String changer, final long changedDate,
            final List<TMXProp> props) {
        // remove part that is to be removed prior to tokenize
        String realSource = source;
        int realPenaltyForRemoved = 0;
        if (removePattern != null) {
            StringBuilder entryRemovedText = new StringBuilder();
            Matcher removeMatcher = removePattern.matcher(realSource);
            while (removeMatcher.find()) {
                entryRemovedText.append(removeMatcher.group());
            }
            realSource = removeMatcher.replaceAll("");
            // calculate penalty if something has been removed, otherwise different strings get 100% match.
            if (!entryRemovedText.toString().equals(removedText)) {
                // penalty for different 'removed'-part
                realPenaltyForRemoved = PENALTY_FOR_REMOVED;
            }
        }

        Token[] candTokens = tokenizeStem(realSource);

        // First percent value - with stemming if possible
        int similarityStem = FuzzyMatcher.calcSimilarity(distance, strTokensStem, candTokens);

        similarityStem -= penalty;
        if (fuzzy) {
            // penalty for fuzzy
            similarityStem -= PENALTY_FOR_FUZZY;
        }
        similarityStem -= realPenaltyForRemoved;

        // check if we have chance by first percentage only
        if (!haveChanceToAdd(similarityStem, Integer.MAX_VALUE, Integer.MAX_VALUE)) {
            return;
        }

        Token[] candTokensNoStem = tokenizeNoStem(realSource);
        // Second percent value - without stemming
        int similarityNoStem = FuzzyMatcher.calcSimilarity(distance, strTokensNoStem, candTokensNoStem);
        similarityNoStem -= penalty;
        if (fuzzy) {
            // penalty for fuzzy
            similarityNoStem -= PENALTY_FOR_FUZZY;
        }
        similarityNoStem -= realPenaltyForRemoved;

        // check if we have chance by first and second percentages
        if (!haveChanceToAdd(similarityStem, similarityNoStem, Integer.MAX_VALUE)) {
            return;
        }

        Token[] candTokensAll = tokenizeAll(realSource);
        // Third percent value - with numbers, tags, etc.
        int simAdjusted = FuzzyMatcher.calcSimilarity(distance, strTokensAll, candTokensAll);
        simAdjusted -= penalty;
        if (fuzzy) {
            // penalty for fuzzy
            simAdjusted -= PENALTY_FOR_FUZZY;
        }
        simAdjusted -= realPenaltyForRemoved;

        // check if we have chance by first, second and third percentages
        if (!haveChanceToAdd(similarityStem, similarityNoStem, simAdjusted)) {
            return;
        }

        addNearString(key, source, translation, comesFrom, fuzzy, similarityStem, similarityNoStem,
                simAdjusted, null, tmxName, creator, creationDate, changer, changedDate, props);
    }

    /**
     * Check if entry have a chance to be added to result list. If no, there is no sense to calculate other
     * parameters.
     *
     * @param simStem
     *            similarity with stemming
     * @param simNoStem
     *            similarity without stemming
     * @param simExactly
     *            exactly similarity
     * @return true if we have chance
     */
    protected boolean haveChanceToAdd(final int simStem, final int simNoStem, final int simExactly) {
        if (simStem < OConsts.FUZZY_MATCH_THRESHOLD && simNoStem < OConsts.FUZZY_MATCH_THRESHOLD) {
            return false;
        }
        if (result.size() < maxCount) {
            return true;
        }
        NearString st = result.get(result.size() - 1);
        int chance = Integer.compare(st.scores[0].score, simStem);
        if (chance == 0) {
            chance = Integer.compare(st.scores[0].scoreNoStem, simNoStem);
        }
        if (chance == 0) {
            chance = Integer.compare(st.scores[0].adjustedScore, simExactly);
        }
        return chance != 1;
    }

    /**
     * Add near string into result list. Near strings sorted by "similarity,simAdjusted"
     */
    protected void addNearString(final EntryKey key, final String source, final String translation,
            NearString.MATCH_SOURCE comesFrom, final boolean fuzzy, final int similarity,
            final int similarityNoStem, final int simAdjusted, final byte[] similarityData,
            final String tmxName, final String creator, final long creationDate, final String changer,
            final long changedDate, final List<TMXProp> tuProperties) {
        // find position for new data
        int pos = 0;
        for (int i = 0; i < result.size(); i++) {
            NearString st = result.get(i);
            if (source.equals(st.source) && Objects.equals(translation, st.translation)) {
                // Consolidate identical matches from different sources into a single NearString with
                // multiple project entries.
                result.set(i, NearString.merge(st, key, source, translation, comesFrom, fuzzy, similarity,
                        similarityNoStem, simAdjusted, similarityData, tmxName, creator, creationDate,
                        changer, changedDate, tuProperties));
                return;
            }
            if (st.scores[0].score < similarity) {
                break;
            }
            if (st.scores[0].score == similarity) {
                if (st.scores[0].scoreNoStem < similarityNoStem) {
                    break;
                }
                if (st.scores[0].scoreNoStem == similarityNoStem) {
                    if (st.scores[0].adjustedScore < simAdjusted) {
                        break;
                    }
                    // Patch contributed by Antonio Vilei
                    // text with the same case has precedence
                    if (similarity == 100 && !st.source.equals(srcText) && source.equals(srcText)) {
                        break;
                    }
                }
            }
            pos = i + 1;
        }

        result.add(pos, new NearString(key, source, translation, comesFrom, fuzzy, similarity,
                similarityNoStem, simAdjusted, similarityData, tmxName, creator, creationDate, changer,
                changedDate, tuProperties));
        if (result.size() > maxCount) {
            result.remove(result.size() - 1);
        }
    }

    /*
     * Methods for tokenize strings with caching.
     */
    Map<String, Token[]> tokenizeStemCache = new HashMap<String, Token[]>();
    Map<String, Token[]> tokenizeNoStemCache = new HashMap<String, Token[]>();
    Map<String, Token[]> tokenizeAllCache = new HashMap<String, Token[]>();

    public Token[] tokenizeStem(String str) {
        Token[] result = tokenizeStemCache.get(str);
        if (result == null) {
            result = tok.tokenizeWords(str, ITokenizer.StemmingMode.MATCHING);
            tokenizeStemCache.put(str, result);
        }
        return result;
    }

    public Token[] tokenizeNoStem(String str) {
        // No-stemming token comparisons are intentionally case-insensitive
        // for matching purposes.
        str = str.toLowerCase(srcLocale);
        Token[] result = tokenizeNoStemCache.get(str);
        if (result == null) {
            result = tok.tokenizeWords(str, ITokenizer.StemmingMode.NONE);
            tokenizeNoStemCache.put(str, result);
        }
        return result;
    }

    public Token[] tokenizeAll(String str) {
        // Verbatim token comparisons are intentionally case-insensitive.
        // for matching purposes.
        str = str.toLowerCase(srcLocale);
        Token[] result = tokenizeAllCache.get(str);
        if (result == null) {
            result = tok.tokenizeVerbatim(str);
            tokenizeAllCache.put(str, result);
        }
        return result;
    }

    protected void checkStopped(IStopped stop) throws StoppedException {
        if (stop.isStopped()) {
            throw new StoppedException();
        }
    }

    /**
     * Process will throw this exception if it stopped.All callers must catch it and just skip.
     */
    @SuppressWarnings("serial")
    public static class StoppedException extends RuntimeException {
    }
}