/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
2006 Henry Pijffers
2009 Didier Briel
2010 Martin Fleurke, Antonio Vilei, Alex Buloichik, Didier Briel
2013 Aaron Madlon-Kay, Alex Buloichik
2014 Alex Buloichik, Piotr Kulik, Aaron Madlon-Kay
2015 Aaron Madlon-Kay
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.core.search;
import java.io.IOException;
import java.nio.file.FileVisitOption;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import org.omegat.core.Core;
import org.omegat.core.data.EntryKey;
import org.omegat.core.data.ExternalTMX;
import org.omegat.core.data.IProject;
import org.omegat.core.data.IProject.FileInfo;
import org.omegat.core.data.ParseEntry;
import org.omegat.core.data.PrepareTMXEntry;
import org.omegat.core.data.ProjectProperties;
import org.omegat.core.data.ProjectTMX;
import org.omegat.core.data.ProtectedPart;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.data.TMXEntry;
import org.omegat.core.threads.LongProcessThread;
import org.omegat.filters2.FilterContext;
import org.omegat.filters2.IParseCallback;
import org.omegat.filters2.TranslationException;
import org.omegat.filters2.master.FilterMaster;
import org.omegat.gui.glossary.GlossaryEntry;
import org.omegat.util.Language;
import org.omegat.util.Log;
import org.omegat.util.OStrings;
import org.omegat.util.StaticUtils;
import org.omegat.util.StringUtil;
/**
* This class implements search functionality. It is non-reentrant: each searcher instance must be used by a
* single thread.
*
* @author Keith Godfrey
* @author Maxym Mykhalchuk
* @author Henry Pijffers
* @author Didier Briel
* @author Martin Fleurke
* @author Antonio Vilei
* @author Alex Buloichik (alex73mail@gmail.com)
* @author Aaron Madlon-Kay
* @author Piotr Kulik
*/
public class Searcher {
/**
 * Creates a searcher bound to a project and a search expression.
 *
 * @param project
 *            current project; it is queried when the project itself is searched and supplies the
 *            project properties when files are searched
 * @param expression
 *            search text and options evaluated by {@link #search()}
 */
public Searcher(final IProject project, final SearchExpression expression) {
    this.expression = expression;
    this.m_project = project;
}
/**
 * Registers the thread whose {@code checkInterrupted()} method is polled while searching.
 *
 * @param thread
 *            thread object used for the interruption checks
 */
public void setThread(LongProcessThread thread) {
    this.checkStop = thread;
}
/**
 * Returns the search expression this searcher was created with.
 *
 * @return the expression passed to the constructor
 */
public SearchExpression getExpression() {
    return this.expression;
}
/**
 * Returns the results collected by the last {@link #search()} call.
 * <p>
 * On the first call after a search, and only when duplicates were collapsed (i.e.
 * {@code expression.allResults} is false), the preamble of every translation-memory and project-memory
 * result is extended with the number of further identical hits that were counted but not listed.
 * Later calls return the list untouched.
 *
 * @return the list of search results; {@code null} if no search has been run yet
 */
public List<SearchResultEntry> getSearchResults() {
    if (!m_preprocessResults) {
        return m_searchResults;
    }
    // the preambles must be decorated only once per search
    m_preprocessResults = false;
    if (expression.allResults) {
        // nothing was collapsed, so there are no "more" counters to show
        return m_searchResults;
    }

    for (SearchResultEntry result : m_searchResults) {
        final String key = result.getSrcText() + result.getTranslation();
        final int origin = result.getEntryNum();

        if (origin == ENTRY_ORIGIN_TRANSLATION_MEMORY) {
            Integer extraHits = m_tmxMap.get(key);
            if (extraHits != null && extraHits > 0) {
                result.setPreamble(StringUtil.format(OStrings.getString("SW_FILE_AND_NR_OF_MORE"),
                        result.getPreamble(), extraHits));
            }
        } else if (origin > ENTRY_ORIGIN_PROJECT_MEMORY) {
            // at this stage each PM entry number has already been increased by 1
            Integer extraHits = m_entryMap.get(key);
            if (extraHits != null && extraHits > 0) {
                String decorated;
                if (StringUtil.isEmpty(result.getPreamble())) {
                    decorated = StringUtil.format(OStrings.getString("SW_NR_OF_MORE"), extraHits);
                } else {
                    decorated = StringUtil.format(OStrings.getString("SW_FILE_AND_NR_OF_MORE"),
                            result.getPreamble(), extraHits);
                }
                result.setPreamble(decorated);
            }
        }
    }
    return m_searchResults;
}
/**
 * Runs the search described by the expression passed to the constructor.
 * <p>
 * Resets the state left by any previous search, compiles the text matcher(s) and the author matcher
 * according to the expression type and options, and then searches either the current project or, when
 * {@code expression.rootDir} is set, the files below that directory. The hits are available from
 * {@link #getSearchResults()} afterwards.
 *
 * @throws Exception
 *             if the search cannot be carried out, e.g. the search text is not a valid regular
 *             expression or walking the search directory fails
 */
public void search() throws Exception {
    m_searchExpression = expression;
    String text = expression.text;
    String author = expression.author;

    m_searchResults = new ArrayList<SearchResultEntry>();
    m_numFinds = 0;
    // ensures that results will be preprocessed only one time
    m_preprocessResults = true;
    // duplicate counters for project-memory and translation-memory hits
    m_entryMap = new HashMap<String, Integer>();
    m_tmxMap = new HashMap<String, Integer>();
    // one matcher per search string (several for a keyword search)
    m_matchers = new ArrayList<Matcher>();

    // determine pattern matching flags
    int flags = expression.caseSensitive ? 0 : Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    // Normalize width of search string if width insensitivity is requested.
    // Then, instead of modifying the regex, we also normalize the
    // comparison strings later on.
    if (m_searchExpression.widthInsensitive) {
        text = StringUtil.normalizeWidth(text);
    }

    // if exact search, just use the entire search string as a single
    // search string; otherwise, if keyword, break up the string into
    // separate words (= multiple search strings)
    switch (expression.searchExpressionType) {
    case EXACT:
    default:
        // escape the search string, it's not supposed to be a regular
        // expression
        text = StaticUtils.globToRegex(text, expression.spaceMatchNbsp);
        // create a matcher for the search string
        m_matchers.add(Pattern.compile(text, flags).matcher(""));
        break;
    case KEYWORD:
        // break the search string into keywords,
        // each of which is a separate search string
        Pattern.compile(" ").splitAsStream(text.trim()).filter(word -> !word.isEmpty()).map(word -> {
            String glob = StaticUtils.globToRegex(word, false);
            return Pattern.compile(glob, flags).matcher("");
        }).forEach(m_matchers::add);
        break;
    case REGEXP:
        // space match nbsp (\u00a0)
        if (expression.spaceMatchNbsp) {
            text = text.replaceAll(" ", "( |\u00A0)");
            text = text.replaceAll("\\\\s", "(\\\\s|\u00A0)");
        }
        // create a matcher for the search string
        m_matchers.add(Pattern.compile(text, flags).matcher(""));
        break;
    }

    // create a matcher for the author search string
    if (author == null) {
        // No author text supplied: drop any matcher left over from a previous search instead of
        // failing on the null. searchAuthor() reports "no match" while the matcher is missing.
        m_author = null;
    } else {
        if (expression.searchExpressionType != SearchExpression.SearchExpressionType.REGEXP) {
            author = StaticUtils.globToRegex(author, expression.spaceMatchNbsp);
        }
        m_author = Pattern.compile(author, flags).matcher("");
    }

    if (expression.rootDir == null) {
        // if no search directory specified, then we are
        // searching current project only
        searchProject();
    } else {
        searchFiles();
    }
}
// ////////////////////////////////////////////////////////////
// internal functions
/**
 * Appends one hit to the result list and increases the hit counter.
 */
private void addEntry(int num, String preamble, String srcPrefix, String src, String target,
        String note, SearchMatch[] srcMatch, SearchMatch[] targetMatch, SearchMatch[] noteMatch) {
    m_searchResults.add(new SearchResultEntry(num, preamble, srcPrefix, src, target, note,
            srcMatch, targetMatch, noteMatch));
    m_numFinds++;
}
/**
 * Queues a found string as a result, unless the maximum number of results has been reached.
 * <p>
 * For project-memory and translation-memory hits, segments with the same source and target are
 * listed only once and further occurrences are merely counted, except when
 * {@code expression.allResults} is set. Hits of any other origin are always listed.
 */
private void foundString(int entryNum, String intro, String src, String target, String note,
        SearchMatch[] srcMatches, SearchMatch[] targetMatches, SearchMatch[] noteMatches) {
    if (m_numFinds >= expression.numberOfResults) {
        return;
    }
    final boolean listEverything = expression.allResults;
    final String key = src + target;

    if (entryNum >= ENTRY_ORIGIN_PROJECT_MEMORY) {
        // entries from project memory
        if (listEverything || !m_entryMap.containsKey(key)) {
            // entries are referenced at offset 1 but stored at offset 0
            final int shownNumber = entryNum + 1;
            String file = expression.fileNames ? getFileForEntry(shownNumber) : null;
            addEntry(shownNumber, file, shownNumber + "> ", src, target, note,
                    srcMatches, targetMatches, noteMatches);
            if (!listEverything) {
                // first occurrence: start counting repetitions
                m_entryMap.put(key, 0);
            }
        } else {
            // repeated occurrence while duplicates are being filtered
            m_entryMap.merge(key, 1, Integer::sum);
        }
        return;
    }

    if (entryNum == ENTRY_ORIGIN_TRANSLATION_MEMORY) {
        // entries from translation memory
        if (listEverything || !m_tmxMap.containsKey(key)) {
            addEntry(entryNum, intro, null, src, target, note, srcMatches, targetMatches, noteMatches);
            if (!listEverything) {
                // first occurrence
                m_tmxMap.put(key, 0);
            }
        } else {
            // next occurrence
            m_tmxMap.merge(key, 1, Integer::sum);
        }
        return;
    }

    // all other entries
    addEntry(entryNum, intro, null, src, target, note, srcMatches, targetMatches, noteMatches);
}
/**
 * Searches the current project: project memory (including orphaned translations), translation
 * memories and glossary, each only if requested by the search expression. Stops as soon as the
 * maximum number of results has been reached.
 */
private void searchProject() {
    // reset the number of search hits
    m_numFinds = 0;

    if (m_searchExpression.memory && !searchProjectMemory()) {
        return;
    }
    if (m_searchExpression.tm && !searchTranslationMemories()) {
        return;
    }
    if (m_searchExpression.glossary) {
        searchGlossary();
    }
}

/**
 * Checks every project entry and then, unless excluded, the orphaned translations.
 *
 * @return false if the hit limit was found to be reached while looping over the project entries
 *         (the whole project search must stop); true otherwise
 */
private boolean searchProjectMemory() {
    for (int index = 0; index < m_project.getAllEntries().size(); index++) {
        // stop searching if the max. nr of hits has been reached
        if (m_numFinds >= expression.numberOfResults) {
            return false;
        }
        // get the source and translation of the next entry
        SourceTextEntry sourceEntry = m_project.getAllEntries().get(index);
        TMXEntry translation = m_project.getTranslationInfo(sourceEntry);
        checkEntry(sourceEntry.getSrcText(), translation.translation, translation.note,
                sourceEntry.getComment(), translation, index, null);
        checkStop.checkInterrupted();
    }
    if (!m_searchExpression.excludeOrphans) {
        searchOrphans();
    }
    return true;
}

/**
 * Checks the default and the multiple translations that the project reports as orphaned.
 */
private void searchOrphans() {
    final String orphanLabel = OStrings.getString("CT_ORPHAN_STRINGS");

    m_project.iterateByDefaultTranslations(new IProject.DefaultTranslationsIterator() {
        @Override
        public void iterate(String source, TMXEntry en) {
            // stop searching if the max. nr of hits has been reached
            if (m_numFinds >= expression.numberOfResults) {
                return;
            }
            checkStop.checkInterrupted();
            if (m_project.isOrphaned(source)) {
                checkEntry(en.source, en.translation, en.note, null, en, ENTRY_ORIGIN_ORPHAN, orphanLabel);
            }
        }
    });

    m_project.iterateByMultipleTranslations(new IProject.MultipleTranslationsIterator() {
        @Override
        public void iterate(EntryKey source, TMXEntry en) {
            // stop searching if the max. nr of hits has been reached
            if (m_numFinds >= expression.numberOfResults) {
                return;
            }
            checkStop.checkInterrupted();
            if (m_project.isOrphaned(source)) {
                checkEntry(en.source, en.translation, en.note, null, en, ENTRY_ORIGIN_ORPHAN, orphanLabel);
            }
        }
    });
}

/**
 * Checks the external translation memories and the memories of other target languages.
 * <p>
 * Skipped entirely when searching by author or date: that information is not loaded from external
 * TMs.
 *
 * @return false if the hit limit stopped the search early (the whole project search must stop);
 *         true otherwise
 */
private boolean searchTranslationMemories() {
    if (expression.searchAuthor || expression.searchDateAfter || expression.searchDateBefore) {
        return true;
    }

    for (Map.Entry<String, ExternalTMX> memory : m_project.getTransMemories().entrySet()) {
        final String tmFile = memory.getKey();
        if (!searchEntries(memory.getValue().getEntries(), tmFile)) {
            return false;
        }
        checkStop.checkInterrupted();
    }

    for (Map.Entry<Language, ProjectTMX> memory : m_project.getOtherTargetLanguageTMs().entrySet()) {
        final String tmLanguage = memory.getKey().getLanguage();
        final ProjectTMX tmx = memory.getValue();
        if (!searchEntriesAlternative(tmx.getDefaults(), tmLanguage)) {
            return false;
        }
        if (!searchEntriesAlternative(tmx.getAlternatives(), tmLanguage)) {
            return false;
        }
        checkStop.checkInterrupted();
    }
    return true;
}

/**
 * Checks the glossary entries returned by the glossary manager for the search text.
 */
private void searchGlossary() {
    final String intro = OStrings.getString("SW_GLOSSARY_RESULT");
    final List<GlossaryEntry> glossaryHits = Core.getGlossaryManager().search(m_searchExpression.text);
    for (GlossaryEntry glossaryEntry : glossaryHits) {
        checkEntry(glossaryEntry.getSrcText(), glossaryEntry.getLocText(), null, null, null,
                ENTRY_ORIGIN_GLOSSARY, intro);
        // stop searching if the max. nr of hits has been reached
        if (m_numFinds >= expression.numberOfResults) {
            return;
        }
        checkStop.checkInterrupted();
    }
}
/**
 * Finds the project file that contains the entry with the given number.
 *
 * @param i
 *            entry number, compared against the {@code entryNum()} values of the first and last
 *            entry of each project file
 * @return the path of the file whose entry-number range contains {@code i}, or {@code null} if no
 *         file does
 */
private String getFileForEntry(int i) {
    List<FileInfo> fileList = Core.getProject().getProjectFiles();
    for (FileInfo fi : fileList) {
        if (fi.entries.isEmpty()) {
            // a file without entries has no range to compare against; get(0) would throw
            continue;
        }
        int first = fi.entries.get(0).entryNum();
        int last = fi.entries.get(fi.entries.size() - 1).entryNum();
        if (i >= first && i <= last) {
            return fi.filePath;
        }
    }
    return null;
}
/**
 * Checks every entry of an external translation memory.
 * <p>
 * The loop ends early, returning false, once the maximum number of hits has been reached.
 *
 * @param tmEn
 *            collection of TMX entries to check
 * @param tmxID
 *            identifier of the TMX (e.g. its file name), shown with each hit
 * @return true when all entries have been checked, false when the search stopped before that
 */
private boolean searchEntries(Collection<PrepareTMXEntry> tmEn, final String tmxID) {
    for (PrepareTMXEntry candidate : tmEn) {
        final boolean limitReached = m_numFinds >= expression.numberOfResults;
        if (limitReached) {
            // stop searching: the max. nr of hits has been reached
            return false;
        }
        // No TMXEntry is passed on, so only source, translation and note of the candidate can be
        // searched.
        checkEntry(candidate.source, candidate.translation, candidate.note, null, null,
                ENTRY_ORIGIN_TRANSLATION_MEMORY, tmxID);
        checkStop.checkInterrupted();
    }
    return true;
}
/**
 * Checks every entry of a translation memory for another target language.
 * <p>
 * The loop ends early, returning false, once the maximum number of hits has been reached.
 *
 * @param tmEn
 *            collection of TMX entries to check
 * @param tmxID
 *            identifier of the TMX (e.g. its language code), shown with each hit
 * @return true when all entries have been checked, false when the search stopped before that
 */
private boolean searchEntriesAlternative(Collection<TMXEntry> tmEn, final String tmxID) {
    for (TMXEntry candidate : tmEn) {
        final boolean limitReached = m_numFinds >= expression.numberOfResults;
        if (limitReached) {
            // stop searching: the max. nr of hits has been reached
            return false;
        }
        // for alternative translations:
        // - it is not feasible to get the source text entry that matches the candidate's source, so
        // the entry number and the real translation are unknown
        // - although the 'translation' is used as 'source', it is searched as translation, otherwise
        // it could not be shown to which real source it belongs
        checkEntry(candidate.source, candidate.translation, candidate.note, null, null,
                ENTRY_ORIGIN_ALTERNATIVE, tmxID);
        checkStop.checkInterrupted();
    }
    return true;
}
/**
 * Checks one entry against the search expression and reports it as a hit if it qualifies.
 *
 * @param srcText
 *            source text
 * @param locText
 *            translation text; {@code null} for an untranslated entry
 * @param note
 *            note text
 * @param comment
 *            comment text
 * @param entry
 *            entry; {@code null} for external TMX entries (for those only source and translation can
 *            be searched, author and date filters never match)
 * @param entryNum
 *            entry number, or one of the negative {@code ENTRY_ORIGIN_*} markers
 * @param intro
 *            file (or other origin label) shown with the hit
 */
protected void checkEntry(String srcText, String locText, String note,
        String comment, TMXEntry entry, int entryNum, String intro) {
    SearchMatch[] srcMatches = null;
    SearchMatch[] targetMatches = null;
    SearchMatch[] srcOrTargetMatches = null;
    SearchMatch[] noteMatches = null;
    SearchMatch[] commentMatches = null;

    switch (m_searchExpression.mode) {
    case SEARCH:
        // when exactly one of "translated" / "untranslated" is requested, skip entries of the
        // other kind
        if (expression.searchTranslated != expression.searchUntranslated
                && expression.searchTranslated != (locText != null)) {
            return;
        }
        if (expression.searchSource && searchString(srcText)) {
            srcMatches = foundMatchesAsArray();
        }
        if (expression.searchTarget && searchString(locText)) {
            targetMatches = foundMatchesAsArray();
        }
        // If
        // - we are searching both source and target
        // - we haven't found a match in either so far
        // - we have a target
        // then we also search the concatenation of source and target per
        // https://sourceforge.net/p/omegat/feature-requests/1185/
        // We join with U+E000 (private use) to prevent spuriously matching
        // e.g. "abc" in "fab" + "cat"
        if (expression.searchSource && expression.searchTarget && locText != null && srcMatches == null
                && targetMatches == null && searchString(srcText + '\ue000' + locText)) {
            srcOrTargetMatches = foundMatchesAsArray();
        }
        if (expression.searchNotes && searchString(note)) {
            noteMatches = foundMatchesAsArray();
        }
        if (expression.searchComments && searchString(comment)) {
            commentMatches = foundMatchesAsArray();
        }
        break;
    case REPLACE:
        if (m_searchExpression.replaceTranslated && locText != null) {
            if (searchString(locText)) {
                targetMatches = foundMatchesAsArray();
            }
        } else if (m_searchExpression.replaceUntranslated && locText == null) {
            if (searchString(srcText)) {
                srcMatches = foundMatchesAsArray();
            }
        }
        break;
    }

    // the text part of the expression must be satisfied somewhere
    final boolean nothingMatched = srcMatches == null && targetMatches == null
            && srcOrTargetMatches == null && noteMatches == null && commentMatches == null;
    if (nothingMatched) {
        return;
    }
    // author filter: needs an entry whose changer or creator matches
    if (expression.searchAuthor && (entry == null || !searchAuthor(entry))) {
        return;
    }
    // date filters: need an entry with a known change date on the requested side of the limit
    if (expression.searchDateBefore
            && (entry == null || entry.changeDate == 0 || entry.changeDate >= expression.dateBefore)) {
        return;
    }
    if (expression.searchDateAfter
            && (entry == null || entry.changeDate == 0 || entry.changeDate <= expression.dateAfter)) {
        return;
    }

    // found
    foundString(entryNum, intro, srcText, locText, note, srcMatches, targetMatches, noteMatches);
}

/**
 * Copies the matches of the most recent {@link #searchString(String)} call into a new array.
 */
private SearchMatch[] foundMatchesAsArray() {
    return foundMatches.toArray(new SearchMatch[foundMatches.size()]);
}
/**
 * Searches the files below {@code expression.rootDir}, loading each one through the filter master
 * and feeding its segments to {@link #searchText(String, String, String)}.
 * <p>
 * With {@code expression.recursive} the whole tree is walked; otherwise only the files directly
 * inside the root directory are visited. Files that cannot be loaded are logged and skipped.
 *
 * @throws Exception
 *             if the directory walk cannot be started or fails
 */
private void searchFiles() throws Exception {
    Path root = Paths.get(expression.rootDir);
    FilterMaster fm = Core.getFilterMaster();
    final SearchCallback searchCallback = new SearchCallback(m_project.getProjectProperties());

    // Files.walk counts the starting path itself as depth 0, so the direct children of the root
    // directory are at depth 1; a depth of 0 would visit nothing but the root.
    int depth = expression.recursive ? Integer.MAX_VALUE : 1;

    // the stream holds open directory handles and must be closed
    try (Stream<Path> paths = Files.walk(root, depth, FileVisitOption.FOLLOW_LINKS)) {
        paths.filter(path -> !Files.isDirectory(path)).forEach(path -> {
            String filename = path.toString();
            FileInfo fi = new FileInfo();
            // determine actual file name w/ no root path info
            fi.filePath = root.relativize(path).toString();
            searchCallback.setCurrentFile(fi);
            try {
                fm.loadFile(filename, new FilterContext(m_project.getProjectProperties()), searchCallback);
            } catch (TranslationException | IOException ex) {
                // best effort: an unreadable file must not abort the whole search
                Log.log(ex);
            }
            searchCallback.fileFinished();
            checkStop.checkInterrupted();
        });
    }
}
/**
 * Parse callback that passes every segment of a loaded file to
 * {@link Searcher#searchText(String, String, String)}, together with the path of the file currently
 * being parsed.
 */
protected class SearchCallback extends ParseEntry implements IParseCallback {
    /** Path of the file set by the latest {@link #setCurrentFile(FileInfo)} call. */
    private String searchedFile;

    public SearchCallback(ProjectProperties config) {
        super(config);
    }

    @Override
    public void setCurrentFile(FileInfo fi) {
        super.setCurrentFile(fi);
        this.searchedFile = fi.filePath;
    }

    // NOTE(review): adds nothing to the inherited behaviour; presumably redeclared so that the
    // enclosing class may call this protected method - confirm before removing.
    @Override
    protected void fileFinished() {
        super.fileFinished();
    }

    @Override
    protected void addSegment(String id, short segmentIndex, String segmentSource,
            List<ProtectedPart> protectedParts, String segmentTranslation, boolean segmentTranslationFuzzy,
            String[] props, String prevSegment, String nextSegment, String path) {
        // only source, translation and the current file are relevant for searching
        searchText(segmentSource, segmentTranslation, this.searchedFile);
    }
}
// /////////////////////////////////////////////////////////////////////
// search algorithm
/**
 * Looks for occurrences of the search string(s) in the supplied text.
 * <p>
 * On success the match regions are left in the list returned by {@link #getFoundMatches()}, sorted
 * and with overlapping regions merged. If width normalization changed the length of the text, the
 * hit is still reported but no regions are recorded, because offsets would not fit the original
 * text.
 *
 * @param origText
 *            the text to search in; may be {@code null}
 * @return true if the text contains all search strings; false otherwise, or if the text is
 *         {@code null} or no matchers have been prepared
 */
public boolean searchString(String origText) {
    if (origText == null || m_matchers == null || m_matchers.isEmpty()) {
        return false;
    }
    final String text = m_searchExpression.widthInsensitive ? StringUtil.normalizeWidth(origText) : origText;
    // offsets found in the searched text are only usable if it is as long as the original
    final boolean offsetsUsable = text.length() == origText.length();

    foundMatches.clear();
    for (Matcher matcher : m_matchers) {
        matcher.reset(text);
        if (!matcher.find()) {
            // one search string is missing: no need to look for the remaining ones
            return false;
        }
        if (!offsetsUsable) {
            // still a hit, but it cannot be highlighted
            continue;
        }
        // record this occurrence and every further one, restarting one character after each start
        int start;
        do {
            start = matcher.start();
            foundMatches.add(new SearchMatch(start, matcher.end()));
        } while (start < text.length() && matcher.find(start + 1));
    }

    // All search strings have been matched, so this is a hit.
    // Merge overlapping regions so that the UI has fewer regions to mark.
    Collections.sort(foundMatches);
    int index = 1;
    while (index < foundMatches.size()) {
        SearchMatch previous = foundMatches.get(index - 1);
        SearchMatch current = foundMatches.get(index);
        boolean overlapping = previous.getStart() <= current.getStart()
                && previous.getEnd() >= current.getStart();
        if (!overlapping) {
            index++;
            continue;
        }
        // leave only one region covering both
        int mergedEnd = Math.max(current.getEnd(), previous.getEnd());
        foundMatches.set(index - 1, new SearchMatch(previous.getStart(), mergedEnd));
        foundMatches.remove(index);
    }
    return true;
}
/**
 * Returns the list that {@link #searchString(String)} fills with match regions. The list itself is
 * returned, not a copy, so its content changes with every further {@code searchString} call.
 *
 * @return the internal list of matches
 */
public List<SearchMatch> getFoundMatches() {
    return this.foundMatches;
}
/**
 * Checks whether the author search string occurs in the changer or the creator of an entry.
 * <p>
 * An empty author pattern stands for "no author": it matches entries that have neither a changer
 * nor a creator.
 *
 * @param te
 *            the entry whose changer and creator are examined
 * @return true if the entry satisfies the author search; false otherwise, or if the entry or the
 *         author matcher is missing
 */
private boolean searchAuthor(TMXEntry te) {
    if (te == null || m_author == null) {
        return false;
    }
    if (m_author.pattern().pattern().isEmpty()) {
        // Handle search for null author.
        return te.changer == null && te.creator == null;
    }
    return authorFoundIn(te.changer) || authorFoundIn(te.creator);
}

/**
 * Tells whether the author pattern is found in the given name; a {@code null} name never matches.
 */
private boolean authorFoundIn(String name) {
    return name != null && m_author.reset(name).find();
}
// ///////////////////////////////////////////////////////////////
// interface used by FileHandlers
/**
 * Searches one segment read from a file and queues it as a hit if its source text matches.
 *
 * @param seg
 *            source text of the segment
 * @param translation
 *            translation of the segment; may be {@code null}
 * @param filename
 *            name of the file the segment comes from, shown with the hit
 */
public void searchText(String seg, String translation, String filename) {
    // don't look further if the max. nr of hits has been reached
    if (m_numFinds >= expression.numberOfResults) {
        return;
    }
    checkStop.checkInterrupted();

    final boolean skipSegment = !m_searchExpression.searchTranslated && translation == null;
    if (skipSegment || !searchString(seg)) {
        return;
    }
    // found a match - queue it with a snapshot of the match regions
    SearchMatch[] sourceMatches = foundMatches.toArray(new SearchMatch[foundMatches.size()]);
    foundString(ENTRY_ORIGIN_TEXT, filename, seg, null, null, sourceMatches, null, null);
}
/**
 * Callback for asking whether a search has been stopped.
 * <p>
 * NOTE(review): not referenced elsewhere in this class, which polls a {@code LongProcessThread}
 * instead; presumably kept for external callers - confirm before removing.
 */
public interface ISearchCheckStop {
/** @return presumably true once the search should be abandoned - confirm against implementers */
boolean isStopped();
}
// ---- origin markers passed as entry number ----
// Project-memory entries use their index (0 and above); everything else gets a negative marker.
// Only PM and TM are counted (separately) for '+X more' statistics
private static final int ENTRY_ORIGIN_PROJECT_MEMORY = 0;
private static final int ENTRY_ORIGIN_TRANSLATION_MEMORY = -1;
private static final int ENTRY_ORIGIN_ORPHAN = -2;
private static final int ENTRY_ORIGIN_ALTERNATIVE = -3;
private static final int ENTRY_ORIGIN_GLOSSARY = -4;
private static final int ENTRY_ORIGIN_TEXT = -5;

// ---- fixed for the lifetime of the searcher ----
/** Expression passed to the constructor. */
private final SearchExpression expression;
/** Match regions of the most recent searchString() call. */
private final List<SearchMatch> foundMatches = new ArrayList<>();
/** Project passed to the constructor. */
private IProject m_project;
/** Thread polled for interruption; set through setThread(). */
private LongProcessThread checkStop;

// ---- state of the current search, (re)initialised by search() ----
/** Same expression as {@code expression}, assigned at the start of search(). */
private SearchExpression m_searchExpression;
private volatile List<SearchResultEntry> m_searchResults;
/** True until getSearchResults() has decorated the result preambles once. */
private boolean m_preprocessResults;
/** Number of hits queued so far. */
private int m_numFinds;
/** HP: one matcher per search string (several for a keyword search). */
private List<Matcher> m_matchers;
/** Matcher for the author search string. */
private Matcher m_author;
/** HP: counts repeated project-memory results, to avoid duplicate entries. */
private Map<String, Integer> m_entryMap;
/** Counts repeated translation-memory results. */
private Map<String, Integer> m_tmxMap;
}