/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2000-2006 Keith Godfrey, Maxym Mykhalchuk, and Henry Pijffers
2007 Didier Briel, Zoltan Bartko
2008 Alex Buloichik
2015 Didier Briel, Aaron Madlon-Kay
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.tokenizer;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.omegat.core.CoreEvents;
import org.omegat.core.events.IProjectEventListener;
import org.omegat.util.PatternConsts;
import org.omegat.util.StringUtil;
import org.omegat.util.Token;
/**
* Methods for tokenizing strings.
*
* @author Keith Godfrey
* @author Maxym Mykhalchuk
* @author Henry Pijffers (henry.pijffers@saxnot.com)
* @author Didier Briel
* @author Zoltan Bartko - bartkozoltan@bartkozoltan.com
* @author Alex Buloichik
* @author Aaron Madlon-Kay
*/
public class DefaultTokenizer implements ITokenizer {

    /**
     * Contains a list of tokens for each *unique* string. By not storing a list
     * of tokens for every string, memory is saved. Token lists are not saved
     * when all tokens are requested. Again to save memory.
     * <p>
     * The map doubles as the monitor guarding every access to it, so the
     * reference must never change; hence it is final.
     */
    private static final Map<String, Token[]> tokenCache = new HashMap<String, Token[]>(5000);

    private static final Token[] EMPTY_TOKENS_LIST = new Token[0];
    private static final String[] EMPTY_STRINGS_LIST = new String[0];

    /**
     * Creates a tokenizer and registers a project listener that empties the
     * shared token cache whenever a project is closed.
     * <p>
     * Note that a listener is registered for every instance created, while
     * the cache it clears is shared by all instances.
     */
    public DefaultTokenizer() {
        CoreEvents.registerProjectChangeListener(new IProjectEventListener() {
            @Override
            public void onProjectChanged(PROJECT_CHANGE_TYPE eventType) {
                if (eventType == PROJECT_CHANGE_TYPE.CLOSE) {
                    // clear cache
                    synchronized (tokenCache) {
                        tokenCache.clear();
                    }
                }
            }
        });
    }

    /**
     * {@inheritDoc}
     * <p>
     * This implementation ignores <code>stemmingMode</code>. Results are
     * cached per source string, and the cached array itself is returned, so
     * callers must not modify it.
     */
    @Override
    public Token[] tokenizeWords(final String strOrig, final StemmingMode stemmingMode) {
        if (StringUtil.isEmpty(strOrig)) {
            return EMPTY_TOKENS_LIST;
        }

        Token[] result;
        synchronized (tokenCache) {
            result = tokenCache.get(strOrig);
        }
        if (result != null) {
            return result;
        }

        // Tokenize outside the lock; two threads may occasionally compute the
        // same entry, in which case the later put simply replaces the earlier one.
        result = tokenizeTextNoCache(strOrig, false);

        // put result in the cache
        synchronized (tokenCache) {
            tokenCache.put(strOrig, result);
        }
        return result;
    }

    /**
     * {@inheritDoc}
     * <p>
     * This implementation ignores <code>stemmingMode</code> and does not use
     * the token cache.
     */
    @Override
    public String[] tokenizeWordsToStrings(String str, StemmingMode stemmingMode) {
        if (StringUtil.isEmpty(str)) {
            return EMPTY_STRINGS_LIST;
        }
        return tokenizeTextToStringsNoCache(str, false);
    }

    /**
     * {@inheritDoc}
     * <p>
     * Every piece produced by the word breaker is returned; nothing is cached.
     */
    @Override
    public Token[] tokenizeVerbatim(final String strOrig) {
        return tokenizeTextNoCache(strOrig, true);
    }

    /**
     * {@inheritDoc}
     * <p>
     * Every piece produced by the word breaker is returned; nothing is cached.
     */
    @Override
    public String[] tokenizeVerbatimToStrings(String str) {
        return tokenizeTextToStringsNoCache(str, true);
    }

    /**
     * Breaks a string into tokens.
     * <p>
     * Examples given by the original authors (token boundaries are decided by
     * the iterator returned from {@link #getWordBreaker()}; this method passes
     * each piece to the {@link Token} constructor unchanged):
     * <ul>
     * <li>This is a semi-good way. -&gt; "this", "is", "a", "semi-good", "way"
     * <li>Fine, thanks, and you? -&gt; "fine", "thanks", "and", "you"
     * <li>C&amp;all this action -&gt; "call", "this", "action" ('&amp;' is eaten;
     * not done by this method itself, presumably by the word breaker - TODO
     * confirm)
     * </ul>
     * <p>
     * OmegaT tags and other non-word tokens are skipped if the parameter "all"
     * is false.
     *
     * @param strOrig
     *            string to tokenize; null or empty yields an empty array
     * @param all
     *            If true, numbers, tags, and other non-word tokens are included
     *            in the list
     * @return array of tokens, each carrying its start offset in
     *         <code>strOrig</code>
     */
    private static Token[] tokenizeTextNoCache(final String strOrig, final boolean all) {
        if (StringUtil.isEmpty(strOrig)) {
            // fixes bug nr. 1382810 (StringIndexOutOfBoundsException)
            return EMPTY_TOKENS_LIST;
        }

        List<Token> tokens = new ArrayList<Token>(64);

        BreakIterator breaker = getWordBreaker();
        breaker.setText(strOrig);

        int start = breaker.first();
        for (int end = breaker.next(); end != BreakIterator.DONE; start = end, end = breaker.next()) {
            String tokenStr = strOrig.substring(start, end);
            if (all || isWordToken(tokenStr)) {
                tokens.add(new Token(tokenStr, start));
            }
        }
        return tokens.toArray(new Token[tokens.size()]);
    }

    /**
     * Breaks a string into token strings, applying the same filtering as
     * {@link #tokenizeTextNoCache(String, boolean)} but returning the raw
     * substrings instead of {@link Token} objects.
     *
     * @param str
     *            string to tokenize; null or empty yields an empty array
     * @param all
     *            If true, every piece produced by the word breaker is
     *            returned; if false, only pieces that contain a letter and are
     *            not OmegaT tags
     * @return array of token strings, in order of appearance
     */
    private static String[] tokenizeTextToStringsNoCache(String str, boolean all) {
        if (StringUtil.isEmpty(str)) {
            return EMPTY_STRINGS_LIST;
        }

        List<String> tokens = new ArrayList<String>(64);

        BreakIterator breaker = getWordBreaker();
        breaker.setText(str);

        int start = breaker.first();
        for (int end = breaker.next(); end != BreakIterator.DONE; start = end, end = breaker.next()) {
            String tokenStr = str.substring(start, end);
            if (all || isWordToken(tokenStr)) {
                tokens.add(tokenStr);
            }
        }
        return tokens.toArray(new String[tokens.size()]);
    }

    /**
     * Tells whether a piece of text counts as a word: it must contain at least
     * one letter and must not be an OmegaT tag.
     */
    private static boolean isWordToken(String tokenStr) {
        return containsLetter(tokenStr) && !PatternConsts.OMEGAT_TAG.matcher(tokenStr).matches();
    }

    /**
     * Tells whether the text contains at least one letter. Iterates by code
     * point so that supplementary characters are classified correctly.
     */
    private static boolean containsLetter(String text) {
        for (int i = 0; i < text.length();) {
            int cp = text.codePointAt(i);
            if (Character.isLetter(cp)) {
                return true;
            }
            i += Character.charCount(cp);
        }
        return false;
    }

    /**
     * Returns an iterator to break sentences into words.
     * <p>
     * A new {@link WordIterator} is created on every call. This is a fix for
     * bug 1589484: with a single shared instance, another thread could set a
     * different text in the middle of breaking a string, which led to index
     * out of bounds exceptions. Returning a fresh iterator each time solves
     * that problem, and according to the original author it doesn't hurt
     * performance either.
     */
    public static BreakIterator getWordBreaker() {
        return new WordIterator();
    }

    /**
     * Check if array contains token.
     *
     * @param tokensList
     *            tokens to search in
     * @param tokenForCheck
     *            token to look for; compared with <code>equals</code>
     * @return true if any element of tokensList equals tokenForCheck
     */
    public static boolean isContains(Token[] tokensList, Token tokenForCheck) {
        for (Token t : tokensList) {
            if (tokenForCheck.equals(t)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Check if array contains other array.
     *
     * @param tokensList
     *            a list of tokens to be searched
     * @param listForFind
     *            a list of tokens to search in tokensList
     * @param notExact
     *            is true if the tokens in listForFind can be non-contiguous or
     *            in a different order in the tokensList. If false, tokens must
     *            appear contiguously and in the same order.
     * @return true if the tokens in listForFind are found in tokensList; an
     *         empty listForFind is always considered found
     */
    public static boolean isContainsAll(Token[] tokensList, Token[] listForFind, boolean notExact) {
        if (!notExact) {
            return isContainsExact(tokensList, listForFind);
        }
        for (Token t : listForFind) {
            if (!isContains(tokensList, t)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if a list of tokens is found contiguously in another list of tokens
     *
     * @param tokensList
     *            a list of tokens to be searched
     * @param listForFind
     *            a list of tokens to search in tokensList
     * @return true if the tokens in listForFind are found contiguously in
     *         tokensList; true as well when listForFind is empty
     */
    private static boolean isContainsExact(Token[] tokensList, Token[] listForFind) {
        if (listForFind.length == 0) {
            // Nothing to look for: trivially contained, as in the non-exact
            // search. Without this guard listForFind[0] below would throw.
            return true;
        }

        // Last index in tokensList at which a full match can still start;
        // negative when listForFind is longer than tokensList.
        int lastStart = tokensList.length - listForFind.length;
        for (int i = 0; i <= lastStart; i++) {
            if (tokensList[i].equals(listForFind[0]) && tailMatchesAt(tokensList, i, listForFind)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks whether listForFind[1..] equals the tokens following position
     * <code>start</code> in tokensList. The caller guarantees that enough
     * tokens remain and that the first token already matched.
     */
    private static boolean tailMatchesAt(Token[] tokensList, int start, Token[] listForFind) {
        for (int j = 1; j < listForFind.length; j++) {
            if (!listForFind[j].equals(tokensList[start + j])) {
                return false;
            }
        }
        return true;
    }

    /**
     * {@inheritDoc}
     * <p>
     * This implementation returns an empty array.
     */
    @Override
    public String[] getSupportedLanguages() {
        return EMPTY_STRINGS_LIST;
    }
}