DoubleArrayTrie.java example

Explorer

MoCA-master
- MoCA
  - src
    - jp
      - ac
        waseda
        info
        kake
        moca
        MoCA.java
        MocaTokenizer.java
        dict
        MocaDictionaries.java
        string
        KanaConverter.java
        Levenshtein.java
        StringSizeConverter.java
        syllable
        SyllabifiedString.java
        Syllable.java
        system
        InputMain.java
        PrintIntegerMaker.java
        tools
        MocaDictionaryBuilder.java
        MocaTokenInfoDictionaryBuilder.java
        viterbi
        MocaViterbi.java
        wordcost
        AbstractWordCostAdjuster.java
        BaseWordCostAdjuster.java
        KanaWordCostAdjuster.java
        LevenshteinWordCostAdjuster.java
        MultipliedWordCostAdjuster.java
        UnknownWordCostAdjuster.java
- TonixyKuromoji
  - src
    - TonixyKuromoji.java
    - jp
      - ac
        waseda
        info
        kake
        string
        KanaConverter.java
        Levenshtein.java
        StringSizeConverter.java
        system
        InputMain.java
        PrintIntegerMaker.java
    - org
      - atilika
        kuromoji
        DebugTokenizer.java
        Token.java
        Tokenizer.java
        TokenizerRunner.java
        dict
        CharacterDefinition.java
        ConnectionCosts.java
        Dictionaries.java
        Dictionary.java
        TokenInfoDictionary.java
        UnknownDictionary.java
        UserDictionary.java
        trie
        DoubleArrayTrie.java
        Trie.java
        util
        CSVUtil.java
        ConnectionCostsBuilder.java
        DictionaryBuilder.java
        DoubleArrayTrieBuilder.java
        TokenInfoDictionaryBuilder.java
        UnknownDictionaryBuilder.java
        viterbi
        Viterbi.java
        ViterbiFormatter.java
        ViterbiNode.java

/**
 * Copyright © 2010-2012 Atilika Inc.  All rights reserved.
 *
 * Atilika Inc. licenses this file to you under the Apache License, Version
 * 2.0 (the "License"); you may not use this file except in compliance with
 * the License.  A copy of the License is distributed with this work in the
 * LICENSE.txt file.  You may also obtain a copy of the License from
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */
package org.atilika.kuromoji.trie;

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.channels.ReadableByteChannel;

import org.atilika.kuromoji.trie.Trie.Node;


/**
 * @author Masaru Hasegawa
 * @author Christian Moen
 */
public class DoubleArrayTrie {

	public static final String FILENAME = "dat.dat";

	public static final char TERMINATING_CHARACTER = '\u0001';

	private static final int BASE_CHECK_INITILAL_SIZE = 1000000;

	private static final int TAIL_INITIAL_SIZE = 10000;

	private static final int TAIL_OFFSET = 10000000;

	private IntBuffer baseBuffer;

	private IntBuffer checkBuffer;

	private CharBuffer tailBuffer;

	private int tailIndex = TAIL_OFFSET;

	public DoubleArrayTrie() {

	}

	/**
	 * Write to file
	 * @param filename filename
	 * @throws IOException
	 */
	public void write(String directoryname) throws IOException  {
		String filename = directoryname + File.separator + FILENAME;

		baseBuffer.rewind();
		checkBuffer.rewind();
		tailBuffer.rewind();

		File file = new File(filename);
		if(file.exists()){
			file.delete();
		}

		RandomAccessFile raf = new RandomAccessFile(filename, "rw");
		FileChannel channel = raf.getChannel();
		raf.writeInt(baseBuffer.capacity());
		raf.writeInt(tailBuffer.capacity());

		ByteBuffer tmpBuffer = ByteBuffer.allocate(baseBuffer.capacity() * 4);
		IntBuffer tmpIntBuffer = tmpBuffer.asIntBuffer();
		tmpIntBuffer.put(baseBuffer);
		tmpBuffer.rewind();
		channel.write(tmpBuffer);

		tmpBuffer = ByteBuffer.allocate(checkBuffer.capacity() * 4);
		tmpIntBuffer = tmpBuffer.asIntBuffer();
		tmpIntBuffer.put(checkBuffer);
		tmpBuffer.rewind();
		channel.write(tmpBuffer);

		tmpBuffer = ByteBuffer.allocate(tailBuffer.capacity() * 2);
		CharBuffer tmpCharBuffer = tmpBuffer.asCharBuffer();
		tmpCharBuffer.put(tailBuffer);
		tmpBuffer.rewind();
		channel.write(tmpBuffer);

		raf.close();
	}

	// TONIXY ファイル名を指定するための引数追加
	public static DoubleArrayTrie getInstance(String fileNamePrefix) throws IOException {
		InputStream is = DoubleArrayTrie.class.getClassLoader().getResourceAsStream(fileNamePrefix + FILENAME);
		return read(is);
	}

	public static DoubleArrayTrie getInstance() throws IOException {
		return getInstance("");
	}

	/**
	 * Load Stored data
	 * @param is
	 * @return
	 * @throws IOException
	 */
	public static DoubleArrayTrie read(InputStream is) throws IOException {
		DoubleArrayTrie trie = new DoubleArrayTrie();
		DataInputStream dis = new DataInputStream(new BufferedInputStream(is));
		int baseCheckSize = dis.readInt();	// Read size of baseArr and checkArr
		int tailSize = dis.readInt();		// Read size of tailArr
		ReadableByteChannel channel = Channels.newChannel(dis);

		//dat.datには、baseBufferおよびcheckBufferのサイズ(実際には4倍のビット数の配列が確保される。int型だからか)、
		//tailBufferのサイズ(実際には2倍のビット数の配列が確保される。char型だからか)、それぞれのデータの順で入っている


		ByteBuffer tmpBaseBuffer = ByteBuffer.allocateDirect(baseCheckSize * 4);	// The size is 4 times the baseCheckSize since it is the length of array
		channel.read(tmpBaseBuffer);
		tmpBaseBuffer.rewind();
		trie.baseBuffer = tmpBaseBuffer.asIntBuffer().asReadOnlyBuffer();

		ByteBuffer tmpCheckBuffer = ByteBuffer.allocateDirect(baseCheckSize * 4);
		channel.read(tmpCheckBuffer);
		tmpCheckBuffer.rewind();
		trie.checkBuffer = tmpCheckBuffer.asIntBuffer().asReadOnlyBuffer();

		ByteBuffer tmpTailBuffer = ByteBuffer.allocateDirect(tailSize * 2);			// The size is 2 times the tailSize since it is the length of array
		channel.read(tmpTailBuffer);
		tmpTailBuffer.rewind();
		trie.tailBuffer = tmpTailBuffer.asCharBuffer().asReadOnlyBuffer();

		is.close();
		return trie;
	}

	/**
	 * Construct double array trie which is equivalent to input trie
	 * @param trie normal trie which contains all dictionary words
	 */
	public void build(Trie trie) {
		baseBuffer = ByteBuffer.allocate(BASE_CHECK_INITILAL_SIZE * 4).asIntBuffer();
		baseBuffer.put(0, 1);
		checkBuffer = ByteBuffer.allocate(BASE_CHECK_INITILAL_SIZE * 4).asIntBuffer();
		tailBuffer = ByteBuffer.allocate(TAIL_INITIAL_SIZE * 2).asCharBuffer();
		add(-1, 0, trie.getRoot());
	}

	/**
	 * Add Node(character) to double array trie
	 * @param previous
	 * @param index
	 * @param node
	 */
	private void add(int previous, int index, Node node) {
		Node[] children = node.getChildren();	// nodes following current node

		if(node.getChildren().length > 0 && node.hasSinglePath() && node.getChildren()[0].getKey() != TERMINATING_CHARACTER) {	// If node has only one path, put the rest in tail array
			baseBuffer.put(index, tailIndex);	// current index of tail array
			addToTail(node.children[0]);
			checkBuffer.put(index, previous);
			return;	// No more child to process
		}

		int base = findBase(index, children);	// Get base value for current index
		baseBuffer.put(index, base);

		if(previous >= 0){
			checkBuffer.put(index, previous);	// Set check value
		}

		for(Trie.Node child : children) {	// For each child to double array trie
			add(index, index + base + child.getKey(), child);
		}

	}

	/**
	 * Match input keyword.
	 * @param key key to match
	 * @return index value of last character in baseBuffer(double array id) if it is complete match. Negative value if it doesn't match. 0 if it is prefix match.
	 */
	public int lookup(String key) {
		int index = 0;
		int base = 1; // base at index 0 should be 1

		int keyLength = key.length();
		for(int i = 0; i < keyLength; i++) {
			int previous = index;
			index = index + base + key.charAt(i);

			if(index > baseBuffer.limit()) { // Too long
				return -1;
			}

			base = baseBuffer.get(index);

			if (base == 0 ) { // Didn't find match
				return -1;
			}

			if(checkBuffer.get(index) != previous){	// check doesn't match
				return -1;
			}

			if(base >= TAIL_OFFSET) {	// If base is bigger than TAIL_OFFSET, start processing "tail"
				return matchTail(base, index, key.substring(i + 1));
			}

		}

		// If we reach at the end of input keyword, check if it is complete match by looking for following terminating character
		int endIndex = index + base + TERMINATING_CHARACTER;

		return checkBuffer.get(endIndex) == index ? index : 0;
	}

	/**
	 * Check match in tail array
	 * @param base
	 * @param index
	 * @param key
	 * @return	index if it is complete match. 0 if it is prefix match. negative value if it doesn't match
	 */
	private int matchTail(int base, int index, String key) {
		int positionInTailArr = base - TAIL_OFFSET;

		int keyLength = key.length();
		for(int i = 0; i < keyLength; i++) {
			if(key.charAt(i) != tailBuffer.get(positionInTailArr + i)){
				return -1;
			}
		}
		return tailBuffer.get(positionInTailArr + keyLength) == TERMINATING_CHARACTER ? index : 0;

	}

	/**
	 * Find base value for current node, which contains input nodes. They are children of current node.
	 * Set default base value , which is one, at the index of each input node.
	 * @param index
	 * @param nodes
	 * @return	base value for current node
	 */
	private int findBase(int index, Node[] nodes){
		int base = baseBuffer.get(index);
		if(base < 0) {
			return base;
		}

		while(true) {
			boolean collision = false;	// already taken?
			for(Node node : nodes) {
				/*
				 * NOTE:
				 * Originally, nextIndex is base + node.getKey(). But to reduce construction time, we use index + base + node.getKey().
				 * However, this makes array bigger. If there is a need to compat the file dat.dat, it's possbile to modify here and there.
				 * Although the size of jar file doesn't change, memory consumption will be smaller.
				 */
				int nextIndex = index + base + node.getKey();

				if(baseBuffer.capacity() <= nextIndex) {
					int newLength = nextIndex + 1;
					IntBuffer newBaseBuffer = ByteBuffer.allocate(newLength * 4).asIntBuffer();
					baseBuffer.rewind();
					newBaseBuffer.put(baseBuffer);
					baseBuffer = newBaseBuffer;
					IntBuffer newCheckBuffer = ByteBuffer.allocate(newLength * 4).asIntBuffer();
					checkBuffer.rewind();
					newCheckBuffer.put(checkBuffer);
					checkBuffer = newCheckBuffer;
				}

				if(baseBuffer.get(nextIndex) != 0) {	// already taken
					base++;	// check next base value
					collision = true;
					break;
				}
			}

			if(!collision){
				break;	// if there is no collision, found proper base value. Break the while loop.
			}

		}

		for(Node node : nodes) {
			baseBuffer.put(index + base + node.getKey(), node.getKey() == TERMINATING_CHARACTER ? -1 : 1);	// Set -1 if key is terminating character. Set default base value 1 if not.
		}

		return base;
	}

	/**
	 * Add characters(nodes) to tail array
	 * @param node
	 */
	private void addToTail(Node node) {
		while(true) {
			if(tailBuffer.capacity() < tailIndex - TAIL_OFFSET + 1){
				CharBuffer newTailBuffer = ByteBuffer.allocate((tailBuffer.capacity() + TAIL_INITIAL_SIZE / 100) * 2).asCharBuffer();
				tailBuffer.rewind();
				newTailBuffer.put(tailBuffer);
				tailBuffer = newTailBuffer;
			}
			tailBuffer.put(tailIndex++ - TAIL_OFFSET, node.getKey());// set character of current node

			if(node.getChildren().length == 0) {	// if it reached the end of input, break.
				break;
			}
			node = node.getChildren()[0];	// Move to next node
		}
	}
}