/**
* Copyright © 2010-2012 Atilika Inc. All rights reserved.
*
* Atilika Inc. licenses this file to you under the Apache License, Version
* 2.0 (the "License"); you may not use this file except in compliance with
* the License. A copy of the License is distributed with this work in the
* LICENSE.txt file. You may also obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
package org.atilika.kuromoji.dict;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.WritableByteChannel;
import org.atilika.kuromoji.util.CSVUtil;
/**
 * Dictionary of token information backed by a {@link ByteBuffer}, plus a
 * mapping from source ids to the word ids stored in that buffer.
 *
 * <p>Each entry in the buffer is laid out as:
 * {@code {left id (short)}{right id (short)}{word cost (short)}
 * {feature length in bytes (short)}{feature chars}}.
 * A word id is the byte offset of an entry inside the buffer.
 *
 * <p>Changed since version 0.7.5: {@code getBaseForm} was added.
 *
 * @author Masaru Hasegawa
 * @author Christian Moen
 */
public class TokenInfoDictionary implements Dictionary {

    public static final String FILENAME = "tid.dat";

    public static final String TARGETMAP_FILENAME = "tid_map.dat";

    /** Size in bytes of the fixed part of an entry: four shorts. */
    private static final int ENTRY_HEADER_BYTES = 8;

    protected ByteBuffer buffer;

    protected int[][] targetMap;

    /**
     * Creates an empty instance; {@code buffer} and {@code targetMap} stay
     * {@code null} until they are loaded.
     */
    public TokenInfoDictionary() {
    }

    /**
     * Creates a writable instance.
     *
     * @param size initial capacity of the entry buffer in bytes
     */
    public TokenInfoDictionary(int size) {
        targetMap = new int[1][];
        buffer = ByteBuffer.allocate(size);
    }

    /**
     * Appends an entry to the buffer.
     *
     * <p>{@code entry[1]}, {@code entry[2]} and {@code entry[3]} are parsed as
     * left id, right id and word cost. All elements from index 4 onwards are
     * joined with {@code INTERNAL_SEPARATOR} and stored as the feature string.
     *
     * @param entry the fields of one dictionary line
     * @return current position of buffer, which will be wordId of next entry
     * @throws NumberFormatException if one of the id/cost fields is not a short
     * @throws IllegalArgumentException if the feature string is too long for
     *         its byte length to be stored in a short
     */
    public int put(String[] entry) {
        short leftId = Short.parseShort(entry[1]);
        short rightId = Short.parseShort(entry[2]);
        short wordCost = Short.parseShort(entry[3]);

        // Separator goes between features only, so an entry without any
        // feature column yields an empty string instead of failing.
        StringBuilder sb = new StringBuilder();
        for (int i = 4; i < entry.length; i++) {
            if (i > 4) {
                sb.append(INTERNAL_SEPARATOR);
            }
            sb.append(entry[i]);
        }
        String features = sb.toString();

        // Each char is written as two bytes.
        int featuresSize = features.length() * 2;
        if (featuresSize > Short.MAX_VALUE) {
            // The length is persisted as a short; a silent cast would corrupt the entry.
            throw new IllegalArgumentException(
                    "Feature string too long (" + featuresSize + " bytes, max " + Short.MAX_VALUE + ")");
        }

        ensureRemaining(ENTRY_HEADER_BYTES + featuresSize);

        buffer.putShort(leftId);
        buffer.putShort(rightId);
        buffer.putShort(wordCost);
        buffer.putShort((short) featuresSize);
        for (int i = 0; i < features.length(); i++) {
            buffer.putChar(features.charAt(i));
        }
        return buffer.position();
    }

    /**
     * Grows the buffer, keeping its content and position, until at least
     * {@code required} more bytes can be written.
     */
    private void ensureRemaining(int required) {
        if (required <= buffer.remaining()) {
            return;
        }
        long needed = (long) buffer.position() + required;
        if (needed > Integer.MAX_VALUE) {
            throw new IllegalStateException("Dictionary buffer would exceed " + Integer.MAX_VALUE + " bytes");
        }
        // Keep doubling: one doubling is not enough for a tiny (or zero-sized)
        // buffer or for an unusually long entry.
        long newCapacity = Math.max(buffer.capacity(), 1);
        do {
            newCapacity *= 2;
        } while (newCapacity < needed);
        if (newCapacity > Integer.MAX_VALUE) {
            newCapacity = Integer.MAX_VALUE;
        }

        ByteBuffer grown = ByteBuffer.allocate((int) newCapacity);
        buffer.flip();
        grown.put(buffer);
        buffer = grown;
    }

    /**
     * Registers {@code wordId} as one more target of {@code sourceId}.
     * The target map and the per-source array are extended as needed.
     *
     * @param sourceId index into the target map
     * @param wordId word id to append to the targets of {@code sourceId}
     */
    public void addMapping(int sourceId, int wordId) {
        if (targetMap.length <= sourceId) {
            int[][] widened = new int[sourceId + 1][];
            System.arraycopy(targetMap, 0, widened, 0, targetMap.length);
            targetMap = widened;
        }

        // Extend the array of targets by one slot and store the id in it.
        int[] existing = targetMap[sourceId];
        int[] targets;
        if (existing == null) {
            targets = new int[1];
        } else {
            targets = new int[existing.length + 1];
            System.arraycopy(existing, 0, targets, 0, existing.length);
        }
        targets[targets.length - 1] = wordId;
        targetMap[sourceId] = targets;
    }

    /**
     * Returns the word ids registered for {@code sourceId}.
     *
     * @param sourceId index into the target map
     * @return the internal array of word ids (not a copy), or {@code null}
     *         if nothing was registered for this source id
     */
    public int[] lookupWordIds(int sourceId) {
        return targetMap[sourceId];
    }

    /** Reads the left id: the first short of the entry at {@code wordId}. */
    @Override
    public int getLeftId(int wordId) {
        return buffer.getShort(wordId);
    }

    /** Reads the right id: the short following the left id. */
    @Override
    public int getRightId(int wordId) {
        return buffer.getShort(wordId + 2); // Skip left id
    }

    /** Reads the word cost: the short following the right id. */
    @Override
    public int getWordCost(int wordId) {
        return buffer.getShort(wordId + 4); // Skip left id and right id
    }

    /**
     * Reads the feature string of the entry at {@code wordId} and splits it
     * on {@code INTERNAL_SEPARATOR}.
     *
     * <p>Note that {@link String#split(String)} treats the separator as a
     * regular expression and drops trailing empty strings.
     *
     * @param wordId byte offset of the entry
     * @return the features of the entry
     */
    @Override
    public String[] getAllFeaturesArray(int wordId) {
        // The stored length is in bytes; two bytes per char.
        int charCount = buffer.getShort(wordId + 6) / 2;
        int featuresStart = wordId + ENTRY_HEADER_BYTES;

        char[] chars = new char[charCount];
        for (int i = 0; i < charCount; i++) {
            chars[i] = buffer.getChar(featuresStart + i * 2);
        }
        return new String(chars).split(INTERNAL_SEPARATOR);
    }

    /**
     * Returns selected features of an entry as a comma separated string.
     *
     * @param wordId byte offset of the entry
     * @param fields indexes of the features to return; when empty, all
     *        features are returned. A single requested feature is returned
     *        as is, otherwise every value goes through
     *        {@code CSVUtil.quoteEscape}.
     * @return the requested features joined by {@code ","}
     */
    @Override
    public String getFeature(int wordId, int... fields) {
        String[] allFeatures = getAllFeaturesArray(wordId);

        if (fields.length == 1) {
            // One feature doesn't need to escape value
            return allFeatures[fields[0]];
        }

        StringBuilder sb = new StringBuilder();
        if (fields.length == 0) {
            // All features
            for (int i = 0; i < allFeatures.length; i++) {
                if (i > 0) {
                    sb.append(",");
                }
                sb.append(CSVUtil.quoteEscape(allFeatures[i]));
            }
        } else {
            for (int i = 0; i < fields.length; i++) {
                if (i > 0) {
                    sb.append(",");
                }
                sb.append(CSVUtil.quoteEscape(allFeatures[fields[i]]));
            }
        }
        return sb.toString();
    }

    /** Returns feature 7 of the entry. */
    @Override
    public String getReading(int wordId) {
        return getFeature(wordId, 7);
    }

    /** Returns all features of the entry, escaped and comma separated. */
    @Override
    public String getAllFeatures(int wordId) {
        return getFeature(wordId);
    }

    /** Returns features 0 to 3 of the entry, escaped and comma separated. */
    @Override
    public String getPartOfSpeech(int wordId) {
        return getFeature(wordId, 0, 1, 2, 3);
    }

    /** Returns feature 6 of the entry. */
    @Override
    public String getBaseForm(int wordId) {
        return getFeature(wordId, 6);
    }

    /**
     * Writes the dictionary ({@link #FILENAME}) and the target map
     * ({@link #TARGETMAP_FILENAME}) into a directory.
     *
     * <p>Dictionary format is:
     * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
     *
     * @param directoryname directory to write both files into
     * @throws IOException if one of the files cannot be written
     */
    public void write(String directoryname) throws IOException {
        writeDictionary(directoryname + File.separator + FILENAME);
        writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
    }

    /**
     * Serializes the target map ({@code int[][]}) into a file.
     *
     * @param filename file to write
     * @throws IOException if the file cannot be written
     */
    protected void writeTargetMap(String filename) throws IOException {
        FileOutputStream fos = new FileOutputStream(filename);
        try {
            ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(fos));
            oos.writeObject(targetMap);
            // Push everything down to the file before the finally block closes it.
            oos.flush();
        } finally {
            fos.close();
        }
    }

    /**
     * Writes the size of the used part of the buffer followed by its bytes.
     * The state of {@code buffer} itself is left untouched, so entries can
     * still be added or the dictionary written again afterwards.
     *
     * @param filename file to write
     * @throws IOException if the file cannot be written
     */
    protected void writeDictionary(String filename) throws IOException {
        FileOutputStream fos = new FileOutputStream(filename);
        try {
            DataOutputStream dos = new DataOutputStream(fos);
            dos.writeInt(buffer.position());
            dos.flush();

            // Flip a duplicate (position -> 0, limit -> used size) rather than
            // the live buffer, whose position and limit must stay valid.
            ByteBuffer used = buffer.duplicate();
            used.flip();

            WritableByteChannel channel = Channels.newChannel(fos);
            while (used.hasRemaining()) {
                channel.write(used);
            }
        } finally {
            fos.close();
        }
    }

    /**
     * Reads the dictionary into a directly allocated buffer, using class path
     * resources whose names are {@code fileNamePrefix} followed by
     * {@link #FILENAME} and {@link #TARGETMAP_FILENAME}.
     *
     * @param fileNamePrefix prefix prepended to both resource names
     * @return TokenInfoDictionary instance
     * @throws IOException if a resource is missing or cannot be read
     * @throws ClassNotFoundException if the target map cannot be deserialized
     */
    public static TokenInfoDictionary getInstance(String fileNamePrefix) throws IOException, ClassNotFoundException {
        TokenInfoDictionary dictionary = new TokenInfoDictionary();
        // The class loader is only used to locate the dictionary resources.
        ClassLoader loader = dictionary.getClass().getClassLoader();
        dictionary.loadDictionary(openResource(loader, fileNamePrefix + FILENAME));
        dictionary.loadTargetMap(openResource(loader, fileNamePrefix + TARGETMAP_FILENAME));
        return dictionary;
    }

    /**
     * Same as {@link #getInstance(String)} with an empty prefix.
     *
     * @return TokenInfoDictionary instance
     * @throws IOException if a resource is missing or cannot be read
     * @throws ClassNotFoundException if the target map cannot be deserialized
     */
    public static TokenInfoDictionary getInstance() throws IOException, ClassNotFoundException {
        return getInstance("");
    }

    /** Opens a class path resource, failing with a descriptive error when it is absent. */
    private static InputStream openResource(ClassLoader loader, String name) throws IOException {
        InputStream is = loader.getResourceAsStream(name);
        if (is == null) {
            throw new IOException("Dictionary resource not found on class path: " + name);
        }
        return is;
    }

    /**
     * Deserializes the target map (an {@code int[][]} object) and closes the stream.
     *
     * <p>NOTE(review): this uses Java native deserialization; the stream
     * must come from a trusted source.
     *
     * @param is stream to read from; always closed by this method
     * @throws IOException if the stream cannot be read
     * @throws ClassNotFoundException if the serialized class cannot be resolved
     */
    protected void loadTargetMap(InputStream is) throws IOException, ClassNotFoundException {
        try {
            ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
            targetMap = (int[][]) ois.readObject();
        } finally {
            is.close();
        }
    }

    /**
     * Reads the size header and then exactly that many bytes into a direct
     * buffer, and closes the stream.
     *
     * @param is stream to read from; always closed by this method
     * @throws IOException if the stream cannot be read, the size header is
     *         negative, or the stream ends before all bytes were read
     */
    protected void loadDictionary(InputStream is) throws IOException {
        try {
            DataInputStream dis = new DataInputStream(is);
            int size = dis.readInt();
            if (size < 0) {
                throw new IOException("Invalid dictionary size: " + size);
            }

            ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
            ReadableByteChannel channel = Channels.newChannel(is);
            // A single read() may deliver fewer bytes than requested, so keep
            // reading until the buffer is full.
            while (tmpBuffer.hasRemaining()) {
                if (channel.read(tmpBuffer) < 0) {
                    throw new IOException("Unexpected end of dictionary data: expected " + size
                            + " bytes but got " + tmpBuffer.position());
                }
            }

            // buffer becomes a read-only view sharing its content with tmpBuffer.
            buffer = tmpBuffer.asReadOnlyBuffer();
        } finally {
            is.close();
        }
    }
}