/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2008 Alex Buloichik (alex73mail@gmail.com)
2013, 2015 Aaron Madlon-Kay
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.tokenizer;
import java.io.IOException;
import java.io.StringReader;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.omegat.core.Core;
import org.omegat.core.CoreEvents;
import org.omegat.core.data.IProject;
import org.omegat.core.data.SourceTextEntry;
import org.omegat.core.events.IProjectEventListener;
import org.omegat.gui.comments.ICommentProvider;
import org.omegat.util.Language;
import org.omegat.util.Log;
import org.omegat.util.StringUtil;
import org.omegat.util.Token;
/**
* Base class for Lucene-based tokenizers.
*
* @author Alex Buloichik (alex73mail@gmail.com)
* @author Aaron Madlon-Kay
*/
public abstract class BaseTokenizer implements ITokenizer {

    // Per-stemming-mode caches of tokenization results. Tokenization is hot
    // (matching, glossary lookup), so results are memoized per source string.
    // All three caches are flushed when the current project closes (see ctor).
    private static final Map<String, Token[]> tokenCacheNone = new ConcurrentHashMap<>(5000);
    private static final Map<String, Token[]> tokenCacheMatching = new ConcurrentHashMap<>(5000);
    private static final Map<String, Token[]> tokenCacheGlossary = new ConcurrentHashMap<>(5000);

    /** Shared empty result — avoids allocating a new array for empty input. */
    protected static final String[] EMPTY_STRING_LIST = new String[0];
    /** Shared empty result — avoids allocating a new array for empty input. */
    protected static final Token[] EMPTY_TOKENS_LIST = new Token[0];
    /** Initial capacity for per-call token accumulators. */
    protected static final int DEFAULT_TOKENS_COUNT = 64;

    /**
     * Indicates that {@link #tokenizeVerbatim(String)} should use OmegaT's
     * {@link WordIterator} to tokenize "exactly" for display.
     * <p>
     * For language-specific tokenizers that maintain the property that
     * <code>(the concatenation of all tokens).equals(original string) == true</code>,
     * set this to false to use the language-specific tokenizer for everything.
     */
    protected boolean shouldDelegateTokenizeExactly = true;

    /**
     * Registers a project listener so that all token caches are cleared when
     * the project is closed (cached results are only valid per-project
     * because stemming depends on the project's languages).
     */
    public BaseTokenizer() {
        CoreEvents.registerProjectChangeListener(new IProjectEventListener() {
            @Override
            public void onProjectChanged(PROJECT_CHANGE_TYPE eventType) {
                if (eventType == PROJECT_CHANGE_TYPE.CLOSE) {
                    tokenCacheNone.clear();
                    tokenCacheMatching.clear();
                    tokenCacheGlossary.clear();
                }
            }
        });
    }

    /**
     * {@inheritDoc}
     * <p>
     * Results are memoized per stemming mode; repeated calls with the same
     * string return the cached array.
     */
    @Override
    public Token[] tokenizeWords(final String strOrig, final StemmingMode stemmingMode) {
        Map<String, Token[]> cache;
        switch (stemmingMode) {
        case NONE:
            cache = tokenCacheNone;
            break;
        case GLOSSARY:
            cache = tokenCacheGlossary;
            break;
        case MATCHING:
            cache = tokenCacheMatching;
            break;
        default:
            // IllegalArgumentException is a RuntimeException, so existing
            // callers catching RuntimeException still work.
            throw new IllegalArgumentException("No cache for stemming mode: " + stemmingMode);
        }
        Token[] result = cache.get(strOrig);
        if (result != null) {
            return result;
        }
        // GLOSSARY and MATCHING stem; MATCHING also keeps stop words;
        // digits are filtered except in GLOSSARY mode.
        result = tokenize(strOrig,
                stemmingMode == StemmingMode.GLOSSARY || stemmingMode == StemmingMode.MATCHING,
                stemmingMode == StemmingMode.MATCHING,
                stemmingMode != StemmingMode.GLOSSARY,
                true);
        cache.put(strOrig, result);
        return result;
    }

    /**
     * {@inheritDoc}
     * <p>
     * Same flag mapping as {@link #tokenizeWords(String, StemmingMode)}, but
     * returns plain strings and is not cached.
     */
    @Override
    public String[] tokenizeWordsToStrings(String str, StemmingMode stemmingMode) {
        return tokenizeToStrings(str,
                stemmingMode == StemmingMode.GLOSSARY || stemmingMode == StemmingMode.MATCHING,
                stemmingMode == StemmingMode.MATCHING,
                stemmingMode != StemmingMode.GLOSSARY,
                true);
    }

    /**
     * {@inheritDoc}
     * <p>
     * By default delegates to {@link WordIterator} so that the concatenation
     * of all tokens reproduces the original string exactly; subclasses that
     * guarantee this property themselves may clear
     * {@link #shouldDelegateTokenizeExactly} to use the Lucene pipeline.
     */
    @Override
    public Token[] tokenizeVerbatim(final String strOrig) {
        if (StringUtil.isEmpty(strOrig)) {
            return EMPTY_TOKENS_LIST;
        }
        if (!shouldDelegateTokenizeExactly) {
            return tokenize(strOrig, false, false, false, false);
        }
        List<Token> result = new ArrayList<>(DEFAULT_TOKENS_COUNT);
        WordIterator iterator = new WordIterator();
        iterator.setText(strOrig);
        int start = iterator.first();
        for (int end = iterator.next(); end != BreakIterator.DONE; start = end,
                end = iterator.next()) {
            String tokenStr = strOrig.substring(start, end);
            result.add(new Token(tokenStr, start));
        }
        return result.toArray(new Token[0]);
    }

    /**
     * {@inheritDoc}
     * <p>
     * String-returning variant of {@link #tokenizeVerbatim(String)}.
     */
    @Override
    public String[] tokenizeVerbatimToStrings(String str) {
        if (StringUtil.isEmpty(str)) {
            return EMPTY_STRING_LIST;
        }
        if (!shouldDelegateTokenizeExactly) {
            return tokenizeToStrings(str, false, false, false, false);
        }
        List<String> result = new ArrayList<>(DEFAULT_TOKENS_COUNT);
        WordIterator iterator = new WordIterator();
        iterator.setText(str);
        int start = iterator.first();
        for (int end = iterator.next(); end != BreakIterator.DONE; start = end,
                end = iterator.next()) {
            String tokenStr = str.substring(start, end);
            result.add(tokenStr);
        }
        return result.toArray(new String[0]);
    }

    /**
     * Split a string into one {@link Token} per Unicode code point
     * (surrogate-pair aware).
     *
     * @param strOrig non-null input string
     * @return one token per code point, offsets in UTF-16 char units
     */
    protected Token[] tokenizeByCodePoint(String strOrig) {
        // See http://www.ibm.com/developerworks/library/j-unicode/#1-5
        // Example 1-5 appears to be faster than 1-6 for us (because our strings are short?)
        Token[] tokens = new Token[strOrig.codePointCount(0, strOrig.length())];
        for (int cp, i = 0, j = 0; i < strOrig.length(); i += Character.charCount(cp)) {
            cp = strOrig.codePointAt(i);
            tokens[j++] = new Token(String.valueOf(Character.toChars(cp)), i);
        }
        return tokens;
    }

    /**
     * Split a string into one string per Unicode code point
     * (surrogate-pair aware).
     *
     * @param strOrig non-null input string
     * @return one single-code-point string per code point
     */
    protected String[] tokenizeByCodePointToStrings(String strOrig) {
        // See http://www.ibm.com/developerworks/library/j-unicode/#1-5
        // Example 1-5 appears to be faster than 1-6 for us (because our strings are short?)
        String[] tokens = new String[strOrig.codePointCount(0, strOrig.length())];
        for (int cp, i = 0, j = 0; i < strOrig.length(); i += Character.charCount(cp)) {
            cp = strOrig.codePointAt(i);
            tokens[j++] = String.valueOf(Character.toChars(cp));
        }
        return tokens;
    }

    /**
     * Run the Lucene {@link TokenStream} produced by
     * {@link #getTokenStream(String, boolean, boolean)} and collect accepted
     * tokens with their offsets.
     *
     * @param strOrig           text to tokenize; empty/null yields {@link #EMPTY_TOKENS_LIST}
     * @param stemsAllowed      request a stemming stream from the subclass
     * @param stopWordsAllowed  request that stop words be kept
     * @param filterDigits      drop tokens containing any digit
     * @param filterWhitespace  drop tokens that are entirely whitespace
     * @return accepted tokens; on I/O error the partial result gathered so far
     */
    protected Token[] tokenize(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed,
            final boolean filterDigits, final boolean filterWhitespace) {
        if (StringUtil.isEmpty(strOrig)) {
            return EMPTY_TOKENS_LIST;
        }
        List<Token> result = new ArrayList<>(DEFAULT_TOKENS_COUNT);
        // try-with-resources closes the stream on every path (the original
        // code both closed in the try body and again via closeQuietly).
        try (TokenStream in = getTokenStream(strOrig, stemsAllowed, stopWordsAllowed)) {
            in.addAttribute(CharTermAttribute.class);
            in.addAttribute(OffsetAttribute.class);
            CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
            OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
            // Lucene consumer contract: reset -> incrementToken* -> end -> close
            in.reset();
            while (in.incrementToken()) {
                String tokenText = cattr.toString();
                if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                    result.add(new Token(tokenText, off.startOffset(), off.endOffset() - off.startOffset()));
                }
            }
            in.end();
        } catch (IOException ex) {
            // Best-effort: log and return whatever was tokenized so far.
            Log.log(ex);
        }
        return result.toArray(new Token[0]);
    }

    /**
     * Like {@link #tokenize(String, boolean, boolean, boolean, boolean)} but
     * returns strings. When stemming is enabled and a stem differs from the
     * original surface form (case-insensitive in the tokenizer's locale), the
     * original form is appended right after the stem.
     *
     * @param str               text to tokenize; empty/null yields {@link #EMPTY_STRING_LIST}
     * @param stemsAllowed      request a stemming stream from the subclass
     * @param stopWordsAllowed  request that stop words be kept
     * @param filterDigits      drop tokens containing any digit
     * @param filterWhitespace  drop tokens that are entirely whitespace
     * @return accepted token strings; on I/O error the partial result gathered so far
     */
    protected String[] tokenizeToStrings(String str, boolean stemsAllowed, boolean stopWordsAllowed,
            boolean filterDigits, boolean filterWhitespace) {
        if (StringUtil.isEmpty(str)) {
            return EMPTY_STRING_LIST;
        }
        List<String> result = new ArrayList<>(DEFAULT_TOKENS_COUNT);
        try (TokenStream in = getTokenStream(str, stemsAllowed, stopWordsAllowed)) {
            in.addAttribute(CharTermAttribute.class);
            in.addAttribute(OffsetAttribute.class);
            CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
            OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
            // Locale is only needed for the stem-vs-surface comparison below.
            Locale loc = stemsAllowed ? getLanguage().getLocale() : null;
            in.reset();
            while (in.incrementToken()) {
                String tokenText = cattr.toString();
                if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                    result.add(tokenText);
                    if (stemsAllowed) {
                        // Keep the original surface form too when the stream
                        // emitted a stem that differs from it.
                        String origText = str.substring(off.startOffset(), off.endOffset());
                        if (!origText.toLowerCase(loc).equals(tokenText.toLowerCase(loc))) {
                            result.add(origText);
                        }
                    }
                }
            }
            in.end();
        } catch (IOException ex) {
            // Best-effort: log and return whatever was tokenized so far.
            Log.log(ex);
        }
        return result.toArray(new String[0]);
    }

    /**
     * Decide whether a token survives the digit/whitespace filters.
     *
     * @param token            candidate token text
     * @param filterDigits     reject tokens containing any digit code point
     * @param filterWhitespace reject tokens made entirely of whitespace
     * @return true if the token should be kept
     */
    private boolean acceptToken(String token, boolean filterDigits, boolean filterWhitespace) {
        if (StringUtil.isEmpty(token)) {
            return false;
        }
        if (!filterDigits && !filterWhitespace) {
            return true;
        }
        boolean isWhitespaceOnly = true;
        // Iterate by code point so supplementary characters are handled.
        for (int i = 0, cp; i < token.length(); i += Character.charCount(cp)) {
            cp = token.codePointAt(i);
            if (filterDigits && Character.isDigit(cp)) {
                return false;
            }
            if (filterWhitespace && !StringUtil.isWhiteSpace(cp)) {
                isWhitespaceOnly = false;
            }
        }
        return !(filterWhitespace && isWhitespaceOnly);
    }

    /**
     * Produce the Lucene {@link TokenStream} implementing this tokenizer's
     * language-specific analysis.
     *
     * @param strOrig          text to analyze
     * @param stemsAllowed     whether the stream should stem tokens
     * @param stopWordsAllowed whether the stream should retain stop words
     * @throws IOException on analyzer I/O failure
     */
    protected abstract TokenStream getTokenStream(String strOrig, boolean stemsAllowed, boolean stopWordsAllowed)
            throws IOException;

    /**
     * Minimal implementation that returns the default implementation
     * corresponding to all false parameters. Subclasses should override this to
     * handle true parameters.
     */
    protected TokenStream getStandardTokenStream(String strOrig) throws IOException {
        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader(strOrig));
        return tokenizer;
    }

    /**
     * {@inheritDoc}
     * <p>
     * Read from the concrete class's {@link Tokenizer} annotation.
     *
     * @throws IllegalStateException if the annotation is missing at runtime
     */
    @Override
    public String[] getSupportedLanguages() {
        Tokenizer ann = getClass().getAnnotation(Tokenizer.class);
        if (ann == null) {
            throw new IllegalStateException(getClass().getName() + " must have a "
                    + Tokenizer.class.getName() + " annotation available at runtime.");
        }
        return ann.languages();
    }

    /**
     * Resolve the concrete language this tokenizer instance handles: either
     * the first annotated language, or — for discover-at-runtime tokenizers —
     * the source/target language of the current project, depending on which
     * side this instance is assigned to.
     *
     * @throws IllegalStateException if runtime discovery is required but no
     *         project is open or this instance is not assigned to it
     */
    protected Language getLanguage() {
        String[] languages = getSupportedLanguages();
        if (languages.length == 0 || languages[0] == Tokenizer.DISCOVER_AT_RUNTIME) {
            IProject proj = Core.getProject();
            if (proj == null) {
                throw new IllegalStateException("This tokenizer's language can only be "
                        + "determined in the context of a project, but project is null.");
            } else if (proj.getSourceTokenizer() == this) {
                return proj.getProjectProperties().getSourceLanguage();
            } else if (proj.getTargetTokenizer() == this) {
                return proj.getProjectProperties().getTargetLanguage();
            } else {
                throw new IllegalStateException("This tokenizer's language can only be "
                        + "determined in the context of a project, but is not assigned "
                        + "to current project.");
            }
        }
        return new Language(languages[0]);
    }

    /**
     * Debug helper: run each input through every tokenization mode and render
     * the results as human-readable text.
     *
     * @param args input strings to exercise
     * @return multi-line report, one section per input
     */
    protected String test(String... args) {
        StringBuilder sb = new StringBuilder();
        sb.append(getClass().getName()).append('\n');
        for (String input : args) {
            sb.append("Input:\n");
            sb.append(input).append("\n");
            sb.append("tokenizeVerbatim:\n");
            sb.append(printTest(tokenizeVerbatimToStrings(input), input));
            sb.append("tokenize:\n");
            sb.append(printTest(tokenizeToStrings(input, false, false, false, true), input));
            sb.append("tokenize (stemsAllowed):\n");
            sb.append(printTest(tokenizeToStrings(input, true, false, false, true), input));
            sb.append("tokenize (stemsAllowed stopWordsAllowed):\n");
            sb.append(printTest(tokenizeToStrings(input, true, true, false, true), input));
            sb.append("tokenize (stemsAllowed stopWordsAllowed filterDigits) (=tokenizeWords(MATCHING)):\n");
            sb.append(printTest(tokenizeToStrings(input, true, true, true, true), input));
            sb.append("tokenize (stemsAllowed filterDigits) (=tokenizeWords(GLOSSARY)):\n");
            sb.append(printTest(tokenizeToStrings(input, true, false, true, true), input));
            sb.append("tokenize (filterDigits) (=tokenizeWords(NONE)):\n");
            sb.append(printTest(tokenizeToStrings(input, false, false, true, true), input));
            sb.append("----------------------------------\n");
        }
        return sb.toString();
    }

    /**
     * Render one tokenization result for {@link #test(String...)}: the joined
     * token list plus whether concatenating the tokens reproduces the input.
     *
     * @param strings tokens to render
     * @param input   original input the tokens came from
     * @return two-line report fragment
     */
    protected String printTest(String[] strings, String input) {
        StringBuilder sb = new StringBuilder();
        sb.append(StringUtils.join(strings, ", ")).append('\n');
        sb.append("Is verbatim: ").append(StringUtils.join(strings, "").equals(input)).append('\n');
        return sb.toString();
    }

    /**
     * Comment provider that shows the full tokenization debug report for the
     * current entry's source text. Singleton constant (now final).
     */
    public static final ICommentProvider TOKENIZER_DEBUG_PROVIDER = new ICommentProvider() {
        @Override
        public String getComment(SourceTextEntry newEntry) {
            return ((BaseTokenizer) Core.getProject().getSourceTokenizer()).test(newEntry.getSrcText());
        }
    };
}