/**
* Copyright © 2010-2012 Atilika Inc. All rights reserved.
*
* Atilika Inc. licenses this file to you under the Apache License, Version
* 2.0 (the "License"); you may not use this file except in compliance with
* the License. A copy of the License is distributed with this work in the
* LICENSE.txt file. You may also obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
package org.atilika.kuromoji.trie;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.channels.ReadableByteChannel;
import org.atilika.kuromoji.trie.Trie.Node;
/**
 * Double-array representation of a {@link Trie}.
 *
 * <p>The structure consists of three buffers: {@code base} and {@code check} (the double array
 * itself) and {@code tail}, which stores the remaining characters of keys once a node only has a
 * single path left. A base value greater than or equal to {@code TAIL_OFFSET} is a pointer into
 * the tail buffer (offset by {@code TAIL_OFFSET}).
 *
 * <p>An instance is either built from a {@link Trie} with {@link #build(Trie)} and persisted with
 * {@link #write(String)}, or loaded with {@link #read(InputStream)} / {@link #getInstance()} and
 * queried with {@link #lookup(String)}.
 *
 * @author Masaru Hasegawa
 * @author Christian Moen
 */
public class DoubleArrayTrie {

    /** Name of the file the trie is written to and loaded from. */
    public static final String FILENAME = "dat.dat";

    /** Character that marks the end of a key inside the trie. */
    public static final char TERMINATING_CHARACTER = '\u0001';

    /** Initial number of int entries in the base and check buffers. */
    private static final int BASE_CHECK_INITIAL_SIZE = 1000000;

    /** Initial number of char entries in the tail buffer. */
    private static final int TAIL_INITIAL_SIZE = 10000;

    /** Base values at or above this offset are positions in the tail buffer (plus this offset). */
    private static final int TAIL_OFFSET = 10000000;

    private IntBuffer baseBuffer;

    private IntBuffer checkBuffer;

    private CharBuffer tailBuffer;

    /** Next free position in the tail buffer, expressed with {@code TAIL_OFFSET} added. */
    private int tailIndex = TAIL_OFFSET;

    public DoubleArrayTrie() {
    }

    /**
     * Writes this trie to the file {@link #FILENAME} inside the given directory.
     *
     * <p>Layout of the file: the capacity of the base/check buffers (int), the capacity of the
     * tail buffer (int), followed by the base values, the check values and the tail characters.
     *
     * @param directoryname directory in which the file is created; an existing file is replaced
     * @throws IOException if the file cannot be opened or written
     */
    public void write(String directoryname) throws IOException {
        String filename = directoryname + File.separator + FILENAME;

        baseBuffer.rewind();
        checkBuffer.rewind();
        tailBuffer.rewind();

        File file = new File(filename);
        if (file.exists()) {
            file.delete();
        }

        RandomAccessFile raf = new RandomAccessFile(filename, "rw");
        try {
            // "rw" mode does not truncate: if the delete above failed, make sure no stale
            // bytes of a previous (possibly larger) file survive after the new content.
            raf.setLength(0);

            FileChannel channel = raf.getChannel();
            raf.writeInt(baseBuffer.capacity());
            raf.writeInt(tailBuffer.capacity());

            ByteBuffer tmpBuffer = ByteBuffer.allocate(baseBuffer.capacity() * 4);
            tmpBuffer.asIntBuffer().put(baseBuffer);
            writeFully(channel, tmpBuffer);

            tmpBuffer = ByteBuffer.allocate(checkBuffer.capacity() * 4);
            tmpBuffer.asIntBuffer().put(checkBuffer);
            writeFully(channel, tmpBuffer);

            tmpBuffer = ByteBuffer.allocate(tailBuffer.capacity() * 2);
            tmpBuffer.asCharBuffer().put(tailBuffer);
            writeFully(channel, tmpBuffer);
        } finally {
            // Always release the file handle, also when one of the writes above fails.
            raf.close();
        }
    }

    /**
     * Writes the complete content of the buffer (from position 0 to its limit) to the channel.
     */
    private static void writeFully(FileChannel channel, ByteBuffer buffer) throws IOException {
        buffer.rewind();
        // A single write() call is not required to consume the whole buffer.
        while (buffer.hasRemaining()) {
            channel.write(buffer);
        }
    }

    /**
     * Loads the trie from the classpath resource {@code fileNamePrefix + FILENAME}.
     *
     * <p>TONIXY: the argument was added so that the file name (prefix) can be specified.
     *
     * @param fileNamePrefix prefix that is prepended to {@link #FILENAME} to form the resource name
     * @return the loaded trie
     * @throws IOException if the resource does not exist or cannot be read
     */
    public static DoubleArrayTrie getInstance(String fileNamePrefix) throws IOException {
        String resourceName = fileNamePrefix + FILENAME;
        InputStream is = DoubleArrayTrie.class.getClassLoader().getResourceAsStream(resourceName);
        if (is == null) {
            // getResourceAsStream signals a missing resource with null, not with an exception.
            throw new IOException("Double array trie resource not found on classpath: " + resourceName);
        }
        return read(is);
    }

    /**
     * Loads the trie from the classpath resource {@link #FILENAME}.
     *
     * @return the loaded trie
     * @throws IOException if the resource does not exist or cannot be read
     */
    public static DoubleArrayTrie getInstance() throws IOException {
        return getInstance("");
    }

    /**
     * Loads stored trie data from a stream in the format produced by {@link #write(String)}.
     *
     * <p>The stream starts with the size of the base/check buffers and the size of the tail
     * buffer (both as number of elements), followed by the base, check and tail data in that
     * order. The stream is closed before this method returns, whether it succeeds or fails.
     *
     * @param is stream to read from
     * @return the loaded trie; its buffers are read-only
     * @throws IOException if the stream is null, ends prematurely, has an invalid header or
     *         cannot be read
     */
    public static DoubleArrayTrie read(InputStream is) throws IOException {
        if (is == null) {
            throw new IOException("Input stream must not be null");
        }
        try {
            DataInputStream dis = new DataInputStream(new BufferedInputStream(is));
            int baseCheckSize = dis.readInt(); // number of ints in base and in check
            int tailSize = dis.readInt();      // number of chars in tail
            if (baseCheckSize < 0 || baseCheckSize > Integer.MAX_VALUE / 4
                    || tailSize < 0 || tailSize > Integer.MAX_VALUE / 2) {
                throw new IOException("Invalid double array trie header: baseCheckSize="
                        + baseCheckSize + ", tailSize=" + tailSize);
            }

            ReadableByteChannel channel = Channels.newChannel(dis);
            DoubleArrayTrie trie = new DoubleArrayTrie();
            // Sizes are element counts: an int takes 4 bytes, a char takes 2 bytes.
            trie.baseBuffer = readFully(channel, baseCheckSize * 4).asIntBuffer().asReadOnlyBuffer();
            trie.checkBuffer = readFully(channel, baseCheckSize * 4).asIntBuffer().asReadOnlyBuffer();
            trie.tailBuffer = readFully(channel, tailSize * 2).asCharBuffer().asReadOnlyBuffer();
            return trie;
        } finally {
            is.close();
        }
    }

    /**
     * Reads exactly {@code byteCount} bytes from the channel into a new direct buffer.
     *
     * @return the filled buffer, positioned at 0
     * @throws IOException if the channel ends before {@code byteCount} bytes were read
     */
    private static ByteBuffer readFully(ReadableByteChannel channel, int byteCount) throws IOException {
        ByteBuffer buffer = ByteBuffer.allocateDirect(byteCount);
        // A single read() may return fewer bytes than requested, so keep reading until full.
        while (buffer.hasRemaining()) {
            if (channel.read(buffer) < 0) {
                throw new IOException("Unexpected end of double array trie data: expected "
                        + byteCount + " bytes but got " + buffer.position());
            }
        }
        buffer.rewind();
        return buffer;
    }

    /**
     * Constructs the double array trie which is equivalent to the input trie.
     *
     * @param trie normal trie which contains all dictionary words
     */
    public void build(Trie trie) {
        baseBuffer = ByteBuffer.allocate(BASE_CHECK_INITIAL_SIZE * 4).asIntBuffer();
        baseBuffer.put(0, 1); // base of the root node
        checkBuffer = ByteBuffer.allocate(BASE_CHECK_INITIAL_SIZE * 4).asIntBuffer();
        tailBuffer = ByteBuffer.allocate(TAIL_INITIAL_SIZE * 2).asCharBuffer();
        add(-1, 0, trie.getRoot());
    }

    /**
     * Adds a node (character) and, recursively, its descendants to the double array trie.
     *
     * @param previous index of the parent node, or a negative value for the root
     * @param index index at which the node is stored
     * @param node node to add
     */
    private void add(int previous, int index, Node node) {
        Node[] children = node.getChildren(); // nodes following the current node

        // If only a single path remains below this node, store the rest in the tail buffer.
        if (children.length > 0 && node.hasSinglePath()
                && children[0].getKey() != TERMINATING_CHARACTER) {
            baseBuffer.put(index, tailIndex); // current position in the tail buffer
            addToTail(children[0]);
            checkBuffer.put(index, previous);
            return; // no more children to process
        }

        int base = findBase(index, children); // base value for the current index
        baseBuffer.put(index, base);

        if (previous >= 0) {
            checkBuffer.put(index, previous); // check value points back to the parent
        }

        for (Trie.Node child : children) {
            add(index, index + base + child.getKey(), child);
        }
    }

    /**
     * Matches the input keyword against the trie.
     *
     * @param key key to match
     * @return index of the last character in the base buffer (double array id) if it is a
     *         complete match, a negative value if it doesn't match, 0 if it is a prefix match
     */
    public int lookup(String key) {
        int index = 0;
        int base = 1; // base at index 0 should be 1
        int limit = baseBuffer.limit();
        int keyLength = key.length();

        for (int i = 0; i < keyLength; i++) {
            int previous = index;
            index = index + base + key.charAt(i);

            // Valid indices are 0 .. limit - 1; anything else cannot be a stored transition.
            if (index < 0 || index >= limit) {
                return -1;
            }

            base = baseBuffer.get(index);
            if (base == 0) { // unused slot: no match
                return -1;
            }
            if (checkBuffer.get(index) != previous) { // slot belongs to another parent
                return -1;
            }
            if (base >= TAIL_OFFSET) { // the rest of the key is stored in the tail buffer
                return matchTail(base, index, key.substring(i + 1));
            }
        }

        // End of the keyword reached: it is a complete match only if the node is followed by
        // the terminating character.
        int endIndex = index + base + TERMINATING_CHARACTER;
        if (endIndex < 0 || endIndex >= checkBuffer.limit()) {
            return 0; // no slot for a terminating child, so at most a prefix match
        }
        return checkBuffer.get(endIndex) == index ? index : 0;
    }

    /**
     * Checks for a match in the tail buffer.
     *
     * @param base base value of the node pointing into the tail buffer
     * @param index index of that node, returned on a complete match
     * @param key remaining part of the key that has to match the tail
     * @return index if it is a complete match, 0 if it is a prefix match, a negative value if it
     *         doesn't match
     */
    private int matchTail(int base, int index, String key) {
        int positionInTailArr = base - TAIL_OFFSET;
        int keyLength = key.length();

        // The key would run past the end of the tail buffer, so it cannot be stored there.
        if (positionInTailArr < 0 || keyLength >= tailBuffer.limit() - positionInTailArr) {
            return -1;
        }

        for (int i = 0; i < keyLength; i++) {
            if (key.charAt(i) != tailBuffer.get(positionInTailArr + i)) {
                return -1;
            }
        }
        return tailBuffer.get(positionInTailArr + keyLength) == TERMINATING_CHARACTER ? index : 0;
    }

    /**
     * Finds the base value for the current node such that the slots of all given child nodes are
     * free, and marks those slots as taken by storing a default base value in each of them.
     *
     * @param index index of the current node
     * @param nodes children of the current node
     * @return base value for the current node
     */
    private int findBase(int index, Node[] nodes) {
        int base = baseBuffer.get(index);
        if (base < 0) {
            return base;
        }

        while (true) {
            boolean collision = false; // already taken?
            for (Node node : nodes) {
                /*
                 * NOTE:
                 * Originally, nextIndex is base + node.getKey(). But to reduce construction time,
                 * we use index + base + node.getKey(). However, this makes the array bigger. If
                 * there is a need to compact the file dat.dat, it's possible to modify here and
                 * there. Although the size of the jar file doesn't change, memory consumption
                 * will be smaller.
                 */
                int nextIndex = index + base + node.getKey();

                if (baseBuffer.capacity() <= nextIndex) {
                    // Grow base and check so that nextIndex becomes the last valid slot.
                    int newLength = nextIndex + 1;

                    IntBuffer newBaseBuffer = ByteBuffer.allocate(newLength * 4).asIntBuffer();
                    baseBuffer.rewind();
                    newBaseBuffer.put(baseBuffer);
                    baseBuffer = newBaseBuffer;

                    IntBuffer newCheckBuffer = ByteBuffer.allocate(newLength * 4).asIntBuffer();
                    checkBuffer.rewind();
                    newCheckBuffer.put(checkBuffer);
                    checkBuffer = newCheckBuffer;
                }

                if (baseBuffer.get(nextIndex) != 0) { // already taken
                    base++; // try the next base value
                    collision = true;
                    break;
                }
            }

            if (!collision) {
                break; // every child slot is free: proper base value found
            }
        }

        for (Node node : nodes) {
            // -1 marks a terminating character, 1 is the default base value otherwise.
            baseBuffer.put(index + base + node.getKey(),
                    node.getKey() == TERMINATING_CHARACTER ? -1 : 1);
        }

        return base;
    }

    /**
     * Adds the characters of a single-path chain of nodes to the tail buffer, starting with the
     * given node and following the first child until a node without children is reached.
     *
     * @param node first node of the chain
     */
    private void addToTail(Node node) {
        while (true) {
            if (tailBuffer.capacity() < tailIndex - TAIL_OFFSET + 1) {
                // Tail buffer is full: grow it by a fixed increment and copy the content over.
                CharBuffer newTailBuffer = ByteBuffer
                        .allocate((tailBuffer.capacity() + TAIL_INITIAL_SIZE / 100) * 2)
                        .asCharBuffer();
                tailBuffer.rewind();
                newTailBuffer.put(tailBuffer);
                tailBuffer = newTailBuffer;
            }
            tailBuffer.put(tailIndex++ - TAIL_OFFSET, node.getKey()); // character of current node

            if (node.getChildren().length == 0) { // end of the chain reached
                break;
            }
            node = node.getChildren()[0]; // move to the next node
        }
    }
}