/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2013, 2015 Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program. If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.tokenizer;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Collections;
import java.util.Set;
import java.util.regex.Matcher;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.omegat.util.PatternConsts;

/**
 * @author Aaron Madlon-Kay
 */
@Tokenizer(languages = { "ja" }, isDefault = true)
public class LuceneJapaneseTokenizer extends BaseTokenizer {

    public LuceneJapaneseTokenizer() {
        super();
        shouldDelegateTokenizeExactly = false;
    }

    @SuppressWarnings("resource")
    @Override
    protected TokenStream getTokenStream(String strOrig, boolean stemsAllowed, boolean stopWordsAllowed)
            throws IOException {
        if (stemsAllowed) {
            // Blank out tags when stemming only
            strOrig = blankOutTags(strOrig);
            CharArraySet stopWords = stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopSet()
                    : CharArraySet.EMPTY_SET;
            Set<String> stopTags = stopWordsAllowed ? JapaneseAnalyzer.getDefaultStopTags()
                    : Collections.emptySet();
            return new JapaneseAnalyzer(null, Mode.SEARCH, stopWords, stopTags).tokenStream("",
                    new StringReader(strOrig));
        } else {
            JapaneseTokenizer tokenizer = new JapaneseTokenizer(null, false, Mode.NORMAL);
            tokenizer.setReader(new StringReader(strOrig));
            return new TagJoiningFilter(tokenizer);
        }
    }

    /**
     * Replace all instances of OmegaT-style tags (&lt;x0&gt;, etc.) with blank spaces
     * of equal length. This is done because
     * <ul><li>This tokenizer will turn "&lt;x0&gt;" into [x, 0], leaving the alphabetical part
     * intact (other tokenizers are expected to produce [x0], which is suppressed by digit filtering)
     * <li>Instead of merely removing the tags, they are replaced with spaces so that
     * the tokens produced correctly line up with the original, unmodified string.
     * </ul>
     */
    private String blankOutTags(String text) {
        StringBuilder buffer = new StringBuilder(text);
        Matcher m = PatternConsts.OMEGAT_TAG.matcher(text);
        while (m.find()) {
            for (int i = m.start(), end = m.end(); i < end; i++) {
                buffer.setCharAt(i, ' ');
            }
        }
        return buffer.toString();
    }
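    /*
     * Worked example (illustrative; assumes PatternConsts.OMEGAT_TAG matches
     * standard OmegaT tags such as "<x0>"):
     *
     *   blankOutTags("<x0>です") -> "    です"
     *
     * The four tag characters become four spaces, so the offsets reported by
     * the analyzer for the remaining tokens still index into the original,
     * unmodified string.
     */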
    /**
     * This filter will reassemble OmegaT-style tags (&lt;x0&gt;, etc.) that this tokenizer
     * has broken apart. It is only meant to recognize "OmegaT-style tags". It has limited
     * recovery capability when encountering false positives.
     */
    private static class TagJoiningFilter extends TokenFilter {

        private static final int BUFFER_INITIAL_SIZE = 5;

        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        private StringBuilder buffer = new StringBuilder(BUFFER_INITIAL_SIZE);
        private int startOffset = -1;
        private boolean buffering = false;

        /** Tokens split off by startBuffering() that must be fed back through this filter. */
        private final ArrayDeque<CachedToken> inputStack = new ArrayDeque<CachedToken>();
        /** Tokens queued for verbatim emission on subsequent incrementToken() calls. */
        private final ArrayDeque<CachedToken> outputStack = new ArrayDeque<CachedToken>();
        /** Original tokens cached while buffering, replayed if the "tag" is a false positive. */
        private final ArrayDeque<CachedToken> recoveryStack = new ArrayDeque<CachedToken>();

        protected TagJoiningFilter(TokenStream input) {
            super(input);
        }

        @Override
        public boolean incrementToken() throws IOException {
            // Emit any tokens queued for verbatim replay first.
            if (!outputStack.isEmpty()) {
                replayToken(outputStack.poll());
                return true;
            }
            while (getNextInput()) {
                char[] chars = termAtt.buffer();
                int len = termAtt.length();
                if (buffering) {
                    if (finishBuffering(chars, len)) {
                        return true;
                    }
                    if (cancelBuffering(chars, len)) {
                        return true;
                    }
                    cacheRecoveryToken(chars, len);
                    buffer.append(chars, 0, len);
                    continue;
                }
                if (startBuffering(chars, len)) {
                    continue;
                }
                return true;
            }
            return finishToken();
        }

        /** Replay any split-off remainders before pulling new tokens from the underlying stream. */
        private boolean getNextInput() throws IOException {
            if (!inputStack.isEmpty()) {
                replayToken(inputStack.poll());
                return true;
            }
            return input.incrementToken();
        }

        /**
         * Begin buffering when a token contains a tag-opening character.
         * Returns true if the whole token was consumed into the buffer.
         */
        private boolean startBuffering(char[] chars, int len) {
            for (int i = 0; i < len; i++) {
                if (isTagOpen(chars[i])) {
                    if (i > 0) {
                        // return true for content up to start, then replay start
                        cacheInputToken(Arrays.copyOfRange(chars, i, len), offsetAtt.startOffset() + i);
                        truncateToken(i);
                        return false;
                    } else {
                        buffer.append(chars, i, len);
                        startOffset = offsetAtt.startOffset();
                        cacheRecoveryToken(chars, len);
                        buffering = true;
                        return true;
                    }
                }
            }
            return false;
        }

        private void truncateToken(int end) {
            termAtt.setLength(end);
            offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset() + end);
        }

        private boolean isTagOpen(char c) {
            return c == '<' || c == '{';
        }

        /**
         * Abort buffering when a non-tag character appears: the first cached
         * original token is re-emitted and the rest are queued on outputStack.
         */
        private boolean cancelBuffering(char[] chars, int len) {
            for (int i = 0; i < len; i++) {
                if (!isTagContent(chars[i])) {
                    cacheRecoveryToken(chars, len);
                    outputStack.addAll(recoveryStack);
                    recoveryStack.clear();
                    replayToken(outputStack.poll());
                    clearBuffer();
                    return true;
                }
            }
            return false;
        }

        private boolean isTagContent(char c) {
            return c == '/' || Character.isLetterOrDigit(c);
        }

        private void replayToken(CachedToken t) {
            termAtt.copyBuffer(t.chars, 0, t.chars.length);
            termAtt.setLength(t.chars.length);
            offsetAtt.setOffset(t.startOffset, t.startOffset + t.chars.length);
        }

        /** Complete the buffered tag if this token contains the matching closing character. */
        private boolean finishBuffering(char[] chars, int len) {
            for (int i = 0; i < len; i++) {
                if (isTagClose(chars[i])) {
                    if (i < len - 1) {
                        // replay remainder afterwards
                        cacheInputToken(Arrays.copyOfRange(chars, i + 1, len), offsetAtt.startOffset() + i + 1);
                    }
                    buffer.append(chars, 0, i + 1);
                    return finishToken();
                }
            }
            return false;
        }

        private boolean isTagClose(char c) {
            char open = buffer.charAt(0);
            return (open == '<' && c == '>') || (open == '{' && c == '}');
        }
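        /*
         * Behavior sketch (the raw token boundaries shown are assumptions
         * about JapaneseTokenizer output, not verified against a specific
         * Kuromoji dictionary):
         *
         *   "<x0>"  -> raw tokens [<, x, 0, >] (punctuation is kept because
         *              discardPunctuation is false above); the filter buffers
         *              them and emits the single token "<x0>".
         *   "<x,0>" -> the "," token fails isTagContent(), so buffering is
         *              cancelled and the cached raw tokens are replayed
         *              unchanged via recoveryStack/outputStack.
         */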
        private boolean finishToken() {
            if (buffer.length() == 0) {
                return false;
            }
            String token = buffer.toString();
            termAtt.copyBuffer(token.toCharArray(), 0, token.length());
            termAtt.setLength(token.length());
            offsetAtt.setOffset(startOffset, startOffset + token.length());
            clearBuffer();
            recoveryStack.clear();
            return true;
        }

        private void clearBuffer() {
            buffer = new StringBuilder(BUFFER_INITIAL_SIZE);
            buffering = false;
        }

        private void cacheInputToken(char[] chars, int start) {
            inputStack.add(new CachedToken(chars, start));
        }

        private void cacheRecoveryToken(char[] chars, int len) {
            recoveryStack.add(new CachedToken(Arrays.copyOf(chars, len), offsetAtt.startOffset()));
        }

        /** Immutable snapshot of a token's text and start offset. */
        private static class CachedToken {
            public final char[] chars;
            public final int startOffset;

            public CachedToken(char[] chars, int startOffset) {
                this.chars = chars;
                this.startOffset = startOffset;
            }
        }
    }
}
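/*
 * Minimal usage sketch, kept package-private and separate from the class
 * above. The ITokenizer.StemmingMode enum and the tokenizeWords(),
 * getOffset(), and getLength() signatures are assumptions based on the
 * surrounding OmegaT APIs, not guarantees.
 */
class LuceneJapaneseTokenizerDemo {
    public static void main(String[] args) {
        ITokenizer tokenizer = new LuceneJapaneseTokenizer();
        String text = "お世話<x0>になります";
        // StemmingMode.NONE exercises the non-stemming path above, i.e.
        // JapaneseTokenizer wrapped in TagJoiningFilter, so "<x0>" should
        // come back as a single token.
        for (org.omegat.util.Token tok : tokenizer.tokenizeWords(text,
                ITokenizer.StemmingMode.NONE)) {
            System.out.println(text.substring(tok.getOffset(),
                    tok.getOffset() + tok.getLength()));
        }
    }
}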