TokenInfoDictionary.java example

Explorer

MoCA-master
- MoCA
  - src
    - jp
      - ac
        waseda
        info
        kake
        moca
        MoCA.java
        MocaTokenizer.java
        dict
        MocaDictionaries.java
        string
        KanaConverter.java
        Levenshtein.java
        StringSizeConverter.java
        syllable
        SyllabifiedString.java
        Syllable.java
        system
        InputMain.java
        PrintIntegerMaker.java
        tools
        MocaDictionaryBuilder.java
        MocaTokenInfoDictionaryBuilder.java
        viterbi
        MocaViterbi.java
        wordcost
        AbstractWordCostAdjuster.java
        BaseWordCostAdjuster.java
        KanaWordCostAdjuster.java
        LevenshteinWordCostAdjuster.java
        MultipliedWordCostAdjuster.java
        UnknownWordCostAdjuster.java
- TonixyKuromoji
  - src
    - TonixyKuromoji.java
    - jp
      - ac
        waseda
        info
        kake
        string
        KanaConverter.java
        Levenshtein.java
        StringSizeConverter.java
        system
        InputMain.java
        PrintIntegerMaker.java
    - org
      - atilika
        kuromoji
        DebugTokenizer.java
        Token.java
        Tokenizer.java
        TokenizerRunner.java
        dict
        CharacterDefinition.java
        ConnectionCosts.java
        Dictionaries.java
        Dictionary.java
        TokenInfoDictionary.java
        UnknownDictionary.java
        UserDictionary.java
        trie
        DoubleArrayTrie.java
        Trie.java
        util
        CSVUtil.java
        ConnectionCostsBuilder.java
        DictionaryBuilder.java
        DoubleArrayTrieBuilder.java
        TokenInfoDictionaryBuilder.java
        UnknownDictionaryBuilder.java
        viterbi
        Viterbi.java
        ViterbiFormatter.java
        ViterbiNode.java

/**
 * Copyright © 2010-2012 Atilika Inc.  All rights reserved.
 *
 * Atilika Inc. licenses this file to you under the Apache License, Version
 * 2.0 (the "License"); you may not use this file except in compliance with
 * the License.  A copy of the License is distributed with this work in the
 * LICENSE.txt file.  You may also obtain a copy of the License from
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */
package org.atilika.kuromoji.dict;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;

import org.atilika.kuromoji.util.CSVUtil;

/**
 * Changed relative to version 0.7.5: getBaseForm was added.
 *
 * @author Masaru Hasegawa
 * @author Christian Moen
 */
public class TokenInfoDictionary implements Dictionary{

	/** File/resource name of the serialized entry buffer. */
	public static final String FILENAME = "tid.dat";

	/** File/resource name of the serialized source-id to word-id map. */
	public static final String TARGETMAP_FILENAME = "tid_map.dat";

	/**
	 * Size in bytes of the fixed part of an entry: left id, right id,
	 * word cost and feature length, each stored as a short.
	 */
	private static final int ENTRY_HEADER_SIZE = 8;

	/**
	 * Entry storage. Each entry is laid out as
	 * {left id(short)}{right id(short)}{word cost(short)}{feature length in bytes(short)}{features(char...)}
	 * and a word id is the byte offset of its entry in this buffer.
	 */
	protected ByteBuffer buffer;

	/** Maps a source id to the word ids (buffer offsets) registered for it via {@link #addMapping(int, int)}. */
	protected int[][] targetMap;

	/**
	 * Creates an empty instance without storage; used by {@link #getInstance(String)},
	 * which fills it through the load methods.
	 */
	public TokenInfoDictionary() {
	}

	/**
	 * Creates an instance ready to be filled with {@link #put(String[])}.
	 *
	 * @param size initial capacity of the entry buffer in bytes; the buffer grows on demand
	 */
	public TokenInfoDictionary(int size) {
		targetMap = new int[1][];
		buffer = ByteBuffer.allocate(size);
	}

	/**
	 * Appends an entry to the buffer.
	 *
	 * @param entry columns of a dictionary line; index 1 is the left id, index 2 the right id,
	 *              index 3 the word cost, and indexes 4 and up are the features
	 * @return current position of buffer, which will be wordId of next entry
	 * @throws NumberFormatException if one of the id/cost columns is not a valid short
	 * @throws IllegalArgumentException if the joined features do not fit in the short length field
	 */
	public int put(String[] entry) {
		short leftId = Short.parseShort(entry[1]);
		short rightId = Short.parseShort(entry[2]);
		short wordCost = Short.parseShort(entry[3]);

		// Join the feature columns; an entry without feature columns yields an empty string.
		StringBuilder sb = new StringBuilder();
		for (int i = 4; i < entry.length; i++) {
			if (i > 4) {
				sb.append(INTERNAL_SEPARATOR);
			}
			sb.append(entry[i]);
		}
		String features = sb.toString();
		int featuresSize = features.length() * 2; // each char is stored as two bytes

		// The length is stored as a short; refuse anything that would be silently truncated by the cast.
		if (featuresSize > Short.MAX_VALUE) {
			throw new IllegalArgumentException("Feature string too long to store: "
					+ featuresSize + " bytes (max " + Short.MAX_VALUE + ")");
		}

		// Extend buffer if necessary: keep doubling until the whole entry fits.
		int required = ENTRY_HEADER_SIZE + featuresSize;
		if (required > buffer.remaining()) {
			long needed = (long) buffer.position() + required;
			long newCapacity = Math.max(buffer.limit(), 1);
			while (newCapacity < needed) {
				newCapacity *= 2;
			}
			ByteBuffer newBuffer = ByteBuffer.allocate((int) Math.min(newCapacity, Integer.MAX_VALUE));
			buffer.flip(); // expose the bytes written so far for copying
			newBuffer.put(buffer);
			buffer = newBuffer;
		}

		buffer.putShort(leftId);
		buffer.putShort(rightId);
		buffer.putShort(wordCost);
		buffer.putShort((short) featuresSize);
		for (int i = 0; i < features.length(); i++) {
			buffer.putChar(features.charAt(i));
		}

		return buffer.position();
	}

	/**
	 * Registers {@code wordId} as one more target of {@code sourceId}.
	 * The outer map is enlarged when {@code sourceId} is beyond its current length.
	 *
	 * @param sourceId index into the target map
	 * @param wordId word id to append to the targets of {@code sourceId}
	 */
	public void addMapping(int sourceId, int wordId) {
		if (targetMap.length <= sourceId) {
			int[][] grown = new int[sourceId + 1][];
			System.arraycopy(targetMap, 0, grown, 0, targetMap.length);
			targetMap = grown;
		}

		// Build an array one element longer than the current one and put wordId in the new last slot.
		int[] current = targetMap[sourceId];
		int[] extended;
		if (current == null) {
			extended = new int[] { wordId };
		} else {
			extended = new int[current.length + 1];
			System.arraycopy(current, 0, extended, 0, current.length);
			extended[current.length] = wordId;
		}
		targetMap[sourceId] = extended;
	}

	/**
	 * Returns the word ids registered for a source id.
	 *
	 * @param sourceId index into the target map
	 * @return the internal array stored for {@code sourceId} (not a copy)
	 */
	public int[] lookupWordIds(int sourceId) {
		return targetMap[sourceId];
	}

	@Override
	public int getLeftId(int wordId) {
		return buffer.getShort(wordId);
	}

	@Override
	public int getRightId(int wordId) {
		return buffer.getShort(wordId + 2);	// Skip left id
	}

	@Override
	public int getWordCost(int wordId) {
		return buffer.getShort(wordId + 4);	// Skip left id and right id
	}

	/**
	 * Reads the feature string of an entry and splits it on {@code INTERNAL_SEPARATOR}.
	 *
	 * @param wordId byte offset of the entry in the buffer
	 * @return the features of the entry
	 */
	@Override
	public String[] getAllFeaturesArray(int wordId) {
		// The length field holds a byte count; two bytes per char.
		int size = buffer.getShort(wordId + 6) / 2;
		char[] chars = new char[size];
		int offset = wordId + ENTRY_HEADER_SIZE; // position where the feature string starts
		for (int i = 0; i < size; i++) {
			chars[i] = buffer.getChar(offset + i * 2);
		}
		return new String(chars).split(INTERNAL_SEPARATOR);
	}

	/**
	 * Returns selected features of an entry as a comma separated string.
	 *
	 * @param wordId byte offset of the entry in the buffer
	 * @param fields indexes of the features to return; none means all features
	 * @return the single requested feature as-is, or the requested features each passed through
	 *         {@code CSVUtil.quoteEscape} and joined with commas
	 */
	@Override
	public String getFeature(int wordId, int... fields) {
		String[] allFeatures = getAllFeaturesArray(wordId);

		if (fields.length == 1) { // One feature doesn't need to escape value
			return allFeatures[fields[0]];
		}

		StringBuilder sb = new StringBuilder();
		if (fields.length == 0) { // All features
			for (int i = 0; i < allFeatures.length; i++) {
				if (i > 0) {
					sb.append(",");
				}
				sb.append(CSVUtil.quoteEscape(allFeatures[i]));
			}
		} else {
			for (int i = 0; i < fields.length; i++) {
				if (i > 0) {
					sb.append(",");
				}
				sb.append(CSVUtil.quoteEscape(allFeatures[fields[i]]));
			}
		}
		return sb.toString();
	}

	@Override
	public String getReading(int wordId) {
		return getFeature(wordId, 7);
	}

	@Override
	public String getAllFeatures(int wordId) {
		return getFeature(wordId);
	}

	@Override
	public String getPartOfSpeech(int wordId) {
		return getFeature(wordId, 0, 1, 2, 3);
	}

	@Override
	public String getBaseForm(int wordId) {
		return getFeature(wordId, 6);
	}

	/**
	 * Write dictionary in file
	 * Dictionary format is:
	 * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
	 * @param directoryname directory in which {@link #FILENAME} and {@link #TARGETMAP_FILENAME} are written
	 * @throws IOException if either file cannot be written
	 */
	public void write(String directoryname) throws IOException {
		writeDictionary(directoryname + File.separator + FILENAME);
		writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
	}

	/**
	 * Serializes the target map to a file with Java object serialization.
	 *
	 * @param filename path of the file to write
	 * @throws IOException if the file cannot be written
	 */
	protected void writeTargetMap(String filename) throws IOException {
		FileOutputStream fos = new FileOutputStream(filename);
		try {
			ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(fos));
			oos.writeObject(targetMap);
			oos.close(); // flushes the buffered data and closes fos
		} finally {
			fos.close(); // releases the file handle when writing failed; harmless if already closed
		}
	}

	/**
	 * Writes the size of the used part of the buffer followed by its content.
	 *
	 * @param filename path of the file to write
	 * @throws IOException if the file cannot be written
	 */
	protected void writeDictionary(String filename) throws IOException {
		FileOutputStream fos = new FileOutputStream(filename);
		try {
			// Work on a duplicate so the position/limit of this.buffer are left untouched.
			ByteBuffer content = buffer.duplicate();
			content.flip();  // set position to 0, set limit to current position

			DataOutputStream dos = new DataOutputStream(fos);
			dos.writeInt(content.limit());
			dos.flush();

			// A single write() is allowed to write fewer bytes than requested.
			WritableByteChannel channel = Channels.newChannel(fos);
			while (content.hasRemaining()) {
				channel.write(content);
			}
		} finally {
			fos.close();
		}
	}

	/**
	 * TONIXY: argument added so that the file name (prefix) can be specified.
	 *
	 * Read dictionary into directly allocated buffer.
	 * @param fileNamePrefix prefix prepended to {@link #FILENAME} and {@link #TARGETMAP_FILENAME}
	 *                       to form the resource names
	 * @return TokenInfoDictionary instance
	 * @throws IOException if a resource is missing or cannot be read completely
	 * @throws ClassNotFoundException if the target map cannot be deserialized
	 */
	public static TokenInfoDictionary getInstance(String fileNamePrefix) throws IOException, ClassNotFoundException {
		TokenInfoDictionary dictionary = new TokenInfoDictionary();
		// The class loader is only used to locate the dictionary resources.
		ClassLoader loader = dictionary.getClass().getClassLoader();
		dictionary.loadDictionary(openResource(loader, fileNamePrefix + FILENAME));
		dictionary.loadTargetMap(openResource(loader, fileNamePrefix + TARGETMAP_FILENAME));
		return dictionary;
	}

	/**
	 * Loads the dictionary from the resources named {@link #FILENAME} and {@link #TARGETMAP_FILENAME}.
	 *
	 * @return TokenInfoDictionary instance
	 * @throws IOException if a resource is missing or cannot be read completely
	 * @throws ClassNotFoundException if the target map cannot be deserialized
	 */
	public static TokenInfoDictionary getInstance() throws IOException, ClassNotFoundException {
		return getInstance("");
	}

	/** Opens a class path resource, failing with a descriptive IOException instead of returning null. */
	private static InputStream openResource(ClassLoader loader, String name) throws IOException {
		InputStream is = loader.getResourceAsStream(name);
		if (is == null) {
			throw new IOException("Dictionary resource not found: " + name);
		}
		return is;
	}

	/**
	 * Reads the target map from a stream and closes the stream.
	 *
	 * @param is stream positioned at a serialized {@code int[][]}
	 * @throws IOException if the stream is null or cannot be read
	 * @throws ClassNotFoundException if the serialized object cannot be resolved
	 */
	protected void loadTargetMap(InputStream is) throws IOException, ClassNotFoundException {
		if (is == null) {
			throw new IOException("Target map stream is null");
		}
		try {
			// NOTE(review): Java native deserialization; only safe when the map file comes from a trusted source.
			ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
			targetMap = (int[][]) ois.readObject(); // the map files hold a single int[][] object
		} finally {
			is.close();
		}
	}

	/**
	 * Reads the entry buffer from a stream into a directly allocated buffer and closes the stream.
	 * The stream must start with the content size (int) followed by that many bytes.
	 *
	 * @param is stream to read from
	 * @throws IOException if the stream is null, the size is negative, or the stream ends
	 *                     before the announced number of bytes was read
	 */
	protected void loadDictionary(InputStream is) throws IOException {
		if (is == null) {
			throw new IOException("Dictionary stream is null");
		}
		try {
			DataInputStream dis = new DataInputStream(is);
			int size = dis.readInt();
			if (size < 0) {
				throw new IOException("Invalid dictionary size: " + size);
			}

			ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);

			// A single read() may return before the buffer is full, so read until it is.
			ReadableByteChannel channel = Channels.newChannel(is);
			while (tmpBuffer.hasRemaining()) {
				if (channel.read(tmpBuffer) < 0) {
					throw new IOException("Unexpected end of dictionary data: expected "
							+ size + " bytes, got " + tmpBuffer.position());
				}
			}

			// buffer becomes a read-only buffer sharing its content with tmpBuffer
			buffer = tmpBuffer.asReadOnlyBuffer();
		} finally {
			is.close();
		}
	}

}