/**
* Copyright © 2010-2012 Atilika Inc. All rights reserved.
*
* Atilika Inc. licenses this file to you under the Apache License, Version
* 2.0 (the "License"); you may not use this file except in compliance with
* the License. A copy of the License is distributed with this work in the
* LICENSE.txt file. You may also obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
package org.atilika.kuromoji;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
import org.atilika.kuromoji.dict.Dictionaries;
import org.atilika.kuromoji.dict.Dictionary;
import org.atilika.kuromoji.dict.TokenInfoDictionary;
import org.atilika.kuromoji.dict.UnknownDictionary;
import org.atilika.kuromoji.dict.UserDictionary;
import org.atilika.kuromoji.viterbi.Viterbi;
import org.atilika.kuromoji.viterbi.ViterbiNode;
import org.atilika.kuromoji.viterbi.ViterbiNode.Type;
/**
* Tokenizer main class.
* Thread safe.
*
* @author Masaru Hasegawa
* @author Christian Moen
*/
public class Tokenizer {
public enum Mode {
NORMAL, SEARCH, EXTENDED
}
/**
* TONIXY protectedに変更
*/
protected final Viterbi viterbi;
private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);
private final boolean split;
/**
* Constructor
* TONIXY 非推奨。未知語分解モードがOFFになる。未知語分解モードの搭載後、従来通り動くためのコンストラクタ
*
* @param dictionary
* @param costs
* @param trie
* @param unkDictionary
* @param userDictionary
* @param mode
*/
/* protected Tokenizer(UserDictionary userDictionary, Mode mode, boolean split) {
this(userDictionary, mode, split, false);
}
*/
/**
* TONIXY 未知語分解モードの搭載のため、引数追加
*
* @param dictionary
* @param costs
* @param trie
* @param unkDictionary
* @param userDictionary
* @param mode
*/
protected Tokenizer(UserDictionary userDictionary, Mode mode, boolean split, boolean unknownFixMode, boolean convertsSize) {
this(new Viterbi(Dictionaries.getTrie(), Dictionaries.getDictionary(),
Dictionaries.getUnknownDictionary(), Dictionaries.getCosts(), userDictionary,
mode, unknownFixMode, convertsSize), Dictionaries.getDictionary(), Dictionaries.getUnknownDictionary(),
userDictionary, mode, split, unknownFixMode);
}
/**
* Constructor
* TONIXY Viterbiや辞書の差し替えを可能に
*
* @param viterbi
* @param dictionary
* @param unkDictionary
* @param userDictionary
* @param mode
* @param split
* @param unknownFixMode
*/
protected Tokenizer(Viterbi viterbi, TokenInfoDictionary dictionary, UnknownDictionary unkDictionary,
UserDictionary userDictionary, Mode mode, boolean split, boolean unknownFixMode) {
this.viterbi = viterbi;
this.split = split;
dictionaryMap.put(Type.KNOWN, dictionary);
dictionaryMap.put(Type.UNKNOWN, unkDictionary);
dictionaryMap.put(Type.USER, userDictionary);
}
/**
* Tokenize input text
* @param text
* @return list of Token
*/
public List<Token> tokenize(String text) {
// 。、を先に処理しない場合
if (!split) {
return doTokenize(0, text);
}
// 。、の位置を取得
List<Integer> splitPositions = getSplitPositions(text);
if(splitPositions.size() == 0) {
return doTokenize(0, text);
}
// 。、の後で区切って処理していく(前で区切って。、のみのトークンを作ったりはしない)
ArrayList<Token> result = new ArrayList<Token>();
int offset = 0;
for(int position : splitPositions) {
result.addAll(doTokenize(offset, text.substring(offset, position + 1)));
offset = position + 1;
}
// 最後の。、以降の解析
if(offset < text.length()) {
result.addAll(doTokenize(offset, text.substring(offset)));
}
return result;
}
/**
* Split input text at 句読点, which is 。 and 、
* @param text
* @return list of split position
*/
private List<Integer> getSplitPositions(String text) {
ArrayList<Integer> splitPositions = new ArrayList<Integer>();
int position = 0;
int currentPosition = 0;
while(true) {
int indexOfMaru = text.indexOf("。", currentPosition);
int indexOfTen = text.indexOf("、", currentPosition);
if(indexOfMaru < 0 || indexOfTen < 0) {
position = Math.max(indexOfMaru, indexOfTen);;
} else {
position = Math.min(indexOfMaru, indexOfTen);
}
if(position >= 0) {
splitPositions.add(position);
currentPosition = position + 1;
} else {
break;
}
}
return splitPositions;
}
/**
* Tokenize input sentence.
* 実際にはsearchで得たViterbiNode列からTokenを作ってArrayListに入れて返す
*
* @param offset offset of sentence in original input text
* @param sentence sentence to tokenize
* @return list of Token
*/
private List<Token> doTokenize(int offset, String sentence) {
ArrayList<Token> result = new ArrayList<Token>();
ViterbiNode[][][] lattice = viterbi.build(sentence);
List<ViterbiNode> bestPath = viterbi.search(lattice);
for (ViterbiNode node : bestPath) {
int wordId = node.getWordId();
if (node.getType() == Type.KNOWN && wordId == -1) { // Do not include BOS/EOS
continue;
}
Token token = new Token(wordId, node.getSurfaceForm(), node.getType(), offset + node.getStartIndex(), dictionaryMap.get(node.getType())); // Pass different dictionary based on the type of node
result.add(token);
}
return result;
}
/**
* Get Builder to create Tokenizer instance.
* @return Builder
*/
public static Builder builder() {
return new Builder();
}
/**
* ユーザー辞書を変更します。TONIXY とにぃによる追記。
*
* @param userDictionary
*/
public void setUserDictionary(UserDictionary userDictionary) {
dictionaryMap.put(Type.USER, userDictionary);
viterbi.setUserDictionary(userDictionary);
}
/**
* Builder class used to create Tokenizer instance.
*/
public static class Builder {
private Mode mode = Mode.NORMAL;
private boolean split = true;
/**
* TONIXY 未知語分解モードのON/OFF。未知語の部分文字列をLatticeに追加するかどうか。trueに設定することを推奨
*/
private boolean unknownFixMode = false;
/**
* TONIXY 全角化のON/OFF。trueに設定することを推奨
*/
private boolean convertsSize = false;
private UserDictionary userDictionary = null;
/**
* Set tokenization mode
* Default: NORMAL
*
* モード設定 Default: NORMAL
*
* @param mode tokenization mode
* @return Builder
*/
public synchronized Builder mode(Mode mode) {
this.mode = mode;
return this;
}
/**
* Set if tokenizer should split input string at "。" and "、" before tokenize to increase performance.
* Splitting shouldn't change the result of tokenization most of the cases.
* Default: true
*
* 先に、。で区切ってパフォーマンスを上げるか否か
*
* @param split whether tokenizer should split input string
* @return Builder
*/
public synchronized Builder split(boolean split) {
this.split = split;
return this;
}
/**
* TONIXY 全角化のON/OFFを設定します。Kuromojiの挙動を保持するためデフォルトはOFFですが、ONにしておくことを推奨します。
*
* ON(true)にすると、辞書から単語を検索する際に、自動的に全角文字に変換して検索します。アルファベットや記号を認識しやすくなります。
*
* @param convertsSize
* @return
*/
public synchronized Builder convertsSize(boolean convertsSize) {
this.convertsSize = convertsSize;
return this;
}
/**
* TONIXY 未知語分解モードのON/OFFを設定します。Kuromojiの挙動を保持するためデフォルトはOFFですが、ONにしておくことを推奨します。
*
* ON(true)にすると、未知語の部分文字列をLatticeに追加することで、未知語部分をできるだけ短くすることができます。
* また、未知語直後にユーザ辞書内単語が来た時、ユーザ辞書内単語以前の結果が消滅する現象も回避できます。
*
* @param unknownFixMode
* @return
*/
public synchronized Builder unknownFixMode(boolean unknownFixMode) {
this.unknownFixMode = unknownFixMode;
return this;
}
/**
* Set user dictionary input stream
* @param userDictionaryInputStream dictionary file as input stream
* @return Builder
* @throws IOException
*/
public synchronized Builder userDictionary(InputStream userDictionaryInputStream) throws IOException {
this.userDictionary = UserDictionary.read(userDictionaryInputStream);
return this;
}
/**
* TONIXY ユーザ辞書読込の自由度を上げるため、Reader型版を追加
*
* @param userDictionaryReader
* @return
* @throws IOException
*/
public synchronized Builder userDictionary(Reader userDictionaryReader)
throws IOException {
this.userDictionary = UserDictionary.read(userDictionaryReader);
return this;
}
/**
* Set user dictionary path
* @param userDictionaryPath path to dictionary file
* @return Builder
* @throws IOException
* @throws FileNotFoundException
*/
public synchronized Builder userDictionary(String userDictionaryPath) throws FileNotFoundException, IOException {
if (userDictionaryPath != null && ! userDictionaryPath.isEmpty()) {
this.userDictionary(new BufferedInputStream(new FileInputStream(userDictionaryPath)));
}
return this;
}
/**
* Create Tokenizer instance
* @return Tokenizer
*/
public synchronized Tokenizer build() {
return new Tokenizer(userDictionary, mode, split, unknownFixMode, convertsSize);
}
}
}