/** * Copyright © 2010-2012 Atilika Inc. All rights reserved. * * Atilika Inc. licenses this file to you under the Apache License, Version * 2.0 (the "License"); you may not use this file except in compliance with * the License. A copy of the License is distributed with this work in the * LICENSE.txt file. You may also obtain a copy of the License from * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. */ package org.atilika.kuromoji.dict; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import org.atilika.kuromoji.dict.CharacterDefinition.CharacterClass; /** * 0.7.5版から変更あり。getBaseformが追加(必ずnullを返す) * * @author Masaru Hasegawa * @author Christian Moen */ public class UnknownDictionary extends TokenInfoDictionary { public static final String FILENAME = "unk.dat"; public static final String TARGETMAP_FILENAME = "unk_map.dat"; public static final String CHARDEF_FILENAME = "cd.dat"; private CharacterDefinition characterDefinition; /** * Constructor */ public UnknownDictionary() { } public UnknownDictionary(int size) { super(size); characterDefinition = new CharacterDefinition(); } @Override public int put(String[] entry) { // Get wordId of current entry int wordId = buffer.position(); // Put entry int result = super.put(entry); // Put entry in targetMap int characterId = CharacterClass.valueOf(entry[0]).getId(); addMapping(characterId, wordId); return result; } public int lookup(String text) { if(!characterDefinition.isGroup(text.charAt(0))) { return 1; } // Extract unknown word. Characters with the same character class are considered to be part of unknown word int characterIdOfFirstCharacter = characterDefinition.lookup(text.charAt(0)); int length = 1; for (int i = 1; i < text.length(); i++) { if (characterIdOfFirstCharacter == characterDefinition.lookup(text.charAt(i))){ length++; } else { break; } } return length; } /** * Put mapping from unicode code point to character class. * * @param codePoint code point * @param class character class name */ public void putCharacterCategory(int codePoint, String characterClassName) { characterDefinition.putCharacterCategory(codePoint, characterClassName); } public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) { characterDefinition.putInvokeDefinition(characterClassName, invoke, group, length); } public CharacterDefinition getCharacterDefinition() { return characterDefinition; } /** * Write dictionary in file * Dictionary format is: * [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...]..... * @param filename * @throws IOException */ public void write(String directoryname) throws IOException { writeDictionary(directoryname + File.separator + FILENAME); writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME); writeCharDef(directoryname + File.separator + CHARDEF_FILENAME); } protected void writeCharDef(String filename) throws IOException { ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename))); oos.writeObject(characterDefinition); oos.close(); } // TONIXY ファイル名を指定するための引数追加 public static UnknownDictionary getInstance(String fileNamePrefix) throws IOException, ClassNotFoundException { UnknownDictionary dictionary = new UnknownDictionary(); ClassLoader loader = dictionary.getClass().getClassLoader(); dictionary.loadDictionary(loader.getResourceAsStream(fileNamePrefix + FILENAME)); dictionary.loadTargetMap(loader.getResourceAsStream(fileNamePrefix + TARGETMAP_FILENAME)); dictionary.loadCharDef(loader.getResourceAsStream(fileNamePrefix + CHARDEF_FILENAME)); return dictionary; } public static UnknownDictionary getInstance() throws IOException, ClassNotFoundException { return getInstance(""); } protected void loadCharDef(InputStream is) throws IOException, ClassNotFoundException { ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is)); characterDefinition = (CharacterDefinition) ois.readObject(); ois.close(); } @Override public String getReading(int wordId) { return null; } @Override public String getBaseForm(int wordId) { return null; } }