Tokenizer.java example

Explorer

MoCA-master
- MoCA
  - src
    - jp
      - ac
        waseda
        info
        kake
        moca
        MoCA.java
        MocaTokenizer.java
        dict
        MocaDictionaries.java
        string
        KanaConverter.java
        Levenshtein.java
        StringSizeConverter.java
        syllable
        SyllabifiedString.java
        Syllable.java
        system
        InputMain.java
        PrintIntegerMaker.java
        tools
        MocaDictionaryBuilder.java
        MocaTokenInfoDictionaryBuilder.java
        viterbi
        MocaViterbi.java
        wordcost
        AbstractWordCostAdjuster.java
        BaseWordCostAdjuster.java
        KanaWordCostAdjuster.java
        LevenshteinWordCostAdjuster.java
        MultipliedWordCostAdjuster.java
        UnknownWordCostAdjuster.java
- TonixyKuromoji
  - src
    - TonixyKuromoji.java
    - jp
      - ac
        waseda
        info
        kake
        string
        KanaConverter.java
        Levenshtein.java
        StringSizeConverter.java
        system
        InputMain.java
        PrintIntegerMaker.java
    - org
      - atilika
        kuromoji
        DebugTokenizer.java
        Token.java
        Tokenizer.java
        TokenizerRunner.java
        dict
        CharacterDefinition.java
        ConnectionCosts.java
        Dictionaries.java
        Dictionary.java
        TokenInfoDictionary.java
        UnknownDictionary.java
        UserDictionary.java
        trie
        DoubleArrayTrie.java
        Trie.java
        util
        CSVUtil.java
        ConnectionCostsBuilder.java
        DictionaryBuilder.java
        DoubleArrayTrieBuilder.java
        TokenInfoDictionaryBuilder.java
        UnknownDictionaryBuilder.java
        viterbi
        Viterbi.java
        ViterbiFormatter.java
        ViterbiNode.java

/**
 * Copyright © 2010-2012 Atilika Inc.  All rights reserved.
 *
 * Atilika Inc. licenses this file to you under the Apache License, Version
 * 2.0 (the "License"); you may not use this file except in compliance with
 * the License.  A copy of the License is distributed with this work in the
 * LICENSE.txt file.  You may also obtain a copy of the License from
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */
package org.atilika.kuromoji;

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;

import org.atilika.kuromoji.dict.Dictionaries;
import org.atilika.kuromoji.dict.Dictionary;
import org.atilika.kuromoji.dict.TokenInfoDictionary;
import org.atilika.kuromoji.dict.UnknownDictionary;
import org.atilika.kuromoji.dict.UserDictionary;
import org.atilika.kuromoji.viterbi.Viterbi;
import org.atilika.kuromoji.viterbi.ViterbiNode;
import org.atilika.kuromoji.viterbi.ViterbiNode.Type;

/**
 * Tokenizer main class.
 *
 * <p>Runs a {@link Viterbi} lattice search over the input text and converts the
 * resulting best path of {@link ViterbiNode}s into a list of {@link Token}s.
 * Instances are created through {@link #builder()}.
 *
 * <p>The original authors describe this class as thread safe.
 * NOTE(review): {@link #setUserDictionary(UserDictionary)} (a TONIXY addition)
 * mutates an unsynchronized {@link EnumMap} and the shared {@link Viterbi}
 * instance, so that claim presumably only holds when the user dictionary is not
 * replaced while other threads are tokenizing - confirm before relying on it.
 *
 * @author Masaru Hasegawa
 * @author Christian Moen
 */
public class Tokenizer {
	/** Tokenization mode; passed through to the {@link Viterbi} constructor. */
	public enum Mode {
		NORMAL, SEARCH, EXTENDED
	}

	/**
	 * Lattice builder / best-path searcher used by this tokenizer.
	 * TONIXY: visibility changed to protected.
	 */
	protected final Viterbi viterbi;

	/** Dictionary handed to each created {@link Token}, selected by the node type. */
	private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<Type, Dictionary>(Type.class);

	/** Whether the input is cut after each "。" / "、" before being tokenized. */
	private final boolean split;

	/*
	 * Legacy constructor, currently disabled.
	 * TONIXY: deprecated. Unknown-word decomposition mode is OFF with this
	 * constructor; it was kept so that existing callers would behave as before
	 * once unknown-word decomposition mode was introduced.
	 */
/*	protected Tokenizer(UserDictionary userDictionary, Mode mode, boolean split) {
		this(userDictionary, mode, split, false);
	}
*/
	/**
	 * Constructor that builds the {@link Viterbi} from the shared {@link Dictionaries}.
	 * TONIXY: arguments were added to support unknown-word decomposition mode.
	 *
	 * @param userDictionary user dictionary; the {@link Builder} passes {@code null} when none was set
	 * @param mode tokenization mode, forwarded to {@link Viterbi}
	 * @param split whether to cut the input after "。" / "、" before tokenizing
	 * @param unknownFixMode unknown-word decomposition mode flag, forwarded to {@link Viterbi}
	 * @param convertsSize full-width conversion flag, forwarded to {@link Viterbi}
	 */
	protected Tokenizer(UserDictionary userDictionary, Mode mode, boolean split, boolean unknownFixMode, boolean convertsSize) {
		this(new Viterbi(Dictionaries.getTrie(), Dictionaries.getDictionary(),
                Dictionaries.getUnknownDictionary(), Dictionaries.getCosts(), userDictionary,
                mode, unknownFixMode, convertsSize), Dictionaries.getDictionary(), Dictionaries.getUnknownDictionary(),
                userDictionary, mode, split, unknownFixMode);
	}

	/**
	 * Constructor.
	 * TONIXY: allows the Viterbi implementation and the dictionaries to be swapped.
	 *
	 * @param viterbi lattice builder / searcher to use
	 * @param dictionary dictionary registered for {@link Type#KNOWN} nodes
	 * @param unkDictionary dictionary registered for {@link Type#UNKNOWN} nodes
	 * @param userDictionary dictionary registered for {@link Type#USER} nodes
	 * @param mode not used by this constructor (the mode is already baked into {@code viterbi})
	 * @param split whether to cut the input after "。" / "、" before tokenizing
	 * @param unknownFixMode not used by this constructor
	 */
	protected Tokenizer(Viterbi viterbi, TokenInfoDictionary dictionary, UnknownDictionary unkDictionary,
			UserDictionary userDictionary, Mode mode, boolean split, boolean unknownFixMode) {

		this.viterbi = viterbi;
		this.split = split;

		dictionaryMap.put(Type.KNOWN, dictionary);
		dictionaryMap.put(Type.UNKNOWN, unkDictionary);
		dictionaryMap.put(Type.USER, userDictionary);
	}

	/**
	 * Tokenize input text.
	 *
	 * <p>When splitting is enabled, the text is cut right after every "。" / "、"
	 * and each piece is tokenized on its own; token positions are still reported
	 * relative to the start of {@code text}.
	 *
	 * @param text text to tokenize
	 * @return list of Token
	 */
	public List<Token> tokenize(String text) {

		// Pre-splitting at "。" / "、" is disabled: tokenize the whole text in one pass.
		if (!split) {
			return doTokenize(0, text);
		}

		// Locate every "。" and "、".
		List<Integer> splitPositions = getSplitPositions(text);

		if (splitPositions.isEmpty()) {
			return doTokenize(0, text);
		}

		// Cut AFTER each "。" / "、" so the punctuation stays at the end of its piece
		// (cutting before it would produce pieces made of the punctuation alone).
		List<Token> result = new ArrayList<Token>();
		int offset = 0;
		for (int position : splitPositions) {
			result.addAll(doTokenize(offset, text.substring(offset, position + 1)));
			offset = position + 1;
		}

		// Tokenize whatever follows the last "。" / "、".
		if (offset < text.length()) {
			result.addAll(doTokenize(offset, text.substring(offset)));
		}

		return result;
	}

	/**
	 * Find the split points of the input text, i.e. the indices of the
	 * punctuation marks "。" and "、".
	 *
	 * @param text text to scan
	 * @return indices of every "。" / "、" in ascending order; empty if there are none
	 */
	private List<Integer> getSplitPositions(String text) {
		List<Integer> splitPositions = new ArrayList<Integer>();

		int currentPosition = 0;

		while (true) {
			int indexOfMaru = text.indexOf("。", currentPosition);
			int indexOfTen = text.indexOf("、", currentPosition);

			int position;
			if (indexOfMaru < 0 || indexOfTen < 0) {
				// At most one mark was found: max() picks the found one, or -1 if neither.
				position = Math.max(indexOfMaru, indexOfTen);
			} else {
				// Both were found: the nearer one comes first.
				position = Math.min(indexOfMaru, indexOfTen);
			}

			if (position < 0) {
				break;
			}

			splitPositions.add(position);
			currentPosition = position + 1;
		}

		return splitPositions;
	}

	/**
	 * Tokenize input sentence.
	 *
	 * <p>Builds the lattice, searches the best path, and turns the resulting
	 * sequence of {@link ViterbiNode}s into {@link Token}s.
	 *
	 * @param offset offset of sentence in original input text
	 * @param sentence sentence to tokenize
	 * @return list of Token
	 */
	private List<Token> doTokenize(int offset, String sentence) {
		List<Token> result = new ArrayList<Token>();

		ViterbiNode[][][] lattice = viterbi.build(sentence);
		List<ViterbiNode> bestPath = viterbi.search(lattice);
		for (ViterbiNode node : bestPath) {
			int wordId = node.getWordId();
			Type type = node.getType();
			if (type == Type.KNOWN && wordId == -1) { // Do not include BOS/EOS
				continue;
			}
			// Pass a different dictionary depending on the type of the node.
			Token token = new Token(wordId, node.getSurfaceForm(), type,
					offset + node.getStartIndex(), dictionaryMap.get(type));
			result.add(token);
		}

		return result;
	}

	/**
	 * Get Builder to create Tokenizer instance.
	 * @return Builder
	 */
	public static Builder builder() {
		return new Builder();
	}

	/**
	 * Replaces the user dictionary, both in this tokenizer's dictionary map and
	 * in the underlying {@link Viterbi}. Added by TONIXY.
	 *
	 * <p>NOTE(review): the two updates are not synchronized with
	 * {@link #tokenize(String)}; avoid calling this while other threads tokenize.
	 *
	 * @param userDictionary the new user dictionary
	 */
	public void setUserDictionary(UserDictionary userDictionary) {
		dictionaryMap.put(Type.USER, userDictionary);
		viterbi.setUserDictionary(userDictionary);
	}

	/**
	 * Builder class used to create Tokenizer instance.
	 */
	public static class Builder {

		private Mode mode = Mode.NORMAL;

		private boolean split = true;

		/**
		 * TONIXY: unknown-word decomposition mode ON/OFF, i.e. whether substrings
		 * of unknown words are added to the lattice. Setting it to true is recommended.
		 */
		private boolean unknownFixMode = false;

		/**
		 * TONIXY: full-width conversion ON/OFF. Setting it to true is recommended.
		 */
		private boolean convertsSize = false;

		private UserDictionary userDictionary = null;

		/**
		 * Set tokenization mode
		 * Default: NORMAL
		 *
		 * @param mode tokenization mode
		 * @return Builder
		 */
		public synchronized Builder mode(Mode mode) {
			this.mode = mode;
			return this;
		}

		/**
		 * Set if tokenizer should split input string at "。" and "、" before tokenize to increase performance.
		 * Splitting shouldn't change the result of tokenization most of the cases.
		 * Default: true
		 *
		 * @param split whether tokenizer should split input string
		 * @return Builder
		 */
		public synchronized Builder split(boolean split) {
			this.split = split;
			return this;
		}

		/**
		 * TONIXY: turns full-width conversion ON/OFF. The default is OFF to keep
		 * Kuromoji's original behavior, but turning it ON is recommended.
		 *
		 * <p>When ON (true), words are automatically converted to full-width
		 * characters when they are looked up in the dictionary, which makes
		 * alphabetic characters and symbols easier to recognize.
		 *
		 * @param convertsSize whether to convert to full-width on dictionary lookup
		 * @return Builder
		 */
		public synchronized Builder convertsSize(boolean convertsSize) {
			this.convertsSize = convertsSize;
			return this;
		}

		/**
		 * TONIXY: turns unknown-word decomposition mode ON/OFF. The default is OFF
		 * to keep Kuromoji's original behavior, but turning it ON is recommended.
		 *
		 * <p>When ON (true), substrings of unknown words are added to the lattice so
		 * that unknown spans become as short as possible. It also avoids the
		 * phenomenon where, when a user-dictionary word directly follows an unknown
		 * word, the results preceding the user-dictionary word disappear.
		 *
		 * @param unknownFixMode whether to enable unknown-word decomposition mode
		 * @return Builder
		 */
		public synchronized Builder unknownFixMode(boolean unknownFixMode) {
			this.unknownFixMode = unknownFixMode;
			return this;
		}

		/**
		 * Set user dictionary input stream.
		 * The stream is not closed by this method; the caller remains responsible for it.
		 *
		 * @param userDictionaryInputStream dictionary file as input stream
		 * @return Builder
		 * @throws IOException
		 */
		public synchronized Builder userDictionary(InputStream userDictionaryInputStream) throws IOException {
			this.userDictionary = UserDictionary.read(userDictionaryInputStream);
			return this;
		}

		/**
		 * TONIXY: Reader-based variant, added to give more freedom in how the
		 * user dictionary is loaded.
		 *
		 * @param userDictionaryReader dictionary contents as a reader
		 * @return Builder
		 * @throws IOException
		 */
		public synchronized Builder userDictionary(Reader userDictionaryReader)
				throws IOException {
			this.userDictionary = UserDictionary.read(userDictionaryReader);
			return this;
		}

		/**
		 * Set user dictionary path.
		 * A {@code null} or empty path is ignored and leaves the builder unchanged.
		 *
		 * @param userDictionaryPath path to dictionary file
		 * @return Builder
		 * @throws IOException
		 * @throws FileNotFoundException
		 */
		public synchronized Builder userDictionary(String userDictionaryPath) throws FileNotFoundException, IOException {
			if (userDictionaryPath != null && ! userDictionaryPath.isEmpty()) {
				// This method opens the file, so it must also close it; closing again is
				// harmless should UserDictionary.read already have closed the stream.
				InputStream in = new BufferedInputStream(new FileInputStream(userDictionaryPath));
				try {
					this.userDictionary(in);
				} finally {
					in.close();
				}
			}
			return this;
		}

		/**
		 * Create Tokenizer instance
		 * @return Tokenizer
		 */
		public synchronized Tokenizer build() {
			return new Tokenizer(userDictionary, mode, split, unknownFixMode, convertsSize);
		}
	}
}