/** * Copyright 2007 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package net.paoding.analysis.dictionary; import java.util.HashMap; import java.util.Map; /** * Dictionary的散列+二叉查找实现。 * <p> * 用于对大数量的,且头字符相同的字符串较多的情况,e.g汉字词语字典。在这种情况下,检索速度将比二叉字典更快。 * <p> * * HashBinaryDictionary以一组已经排序的词语为输入,所有<b>头字符</b>相同的词语划为一个集合作为分字典(使用BinaryDictionary实现)。 * 查找词语时,先根据第一个字符找得分词典(BinaryDictionary实现),再从该分词典中定位该词语。 * <p> * * @author Zhiliang Wang [qieqie.wang@gmail.com] * * @see BinaryDictionary * * @since 1.0 * */ public class HashBinaryDictionary implements Dictionary { // ------------------------------------------------- /** * 字典中所有词语,用于方便{@link #get(int)}方法 */ private Word[] ascWords; /** * 首字符到分词典的映射 */ private Map/* <Object, SubDictionaryWrap> */subs; /** * */ private final int hashIndex; private final int start; private final int end; private final int count; // ------------------------------------------------- /** * * @param ascWords * 升序排列词语 * @param initialCapacity * @param loadFactor */ public HashBinaryDictionary(Word[] ascWords, int initialCapacity, float loadFactor) { this(ascWords, 0, 0, ascWords.length, initialCapacity, loadFactor); } public HashBinaryDictionary(Word[] ascWords, int hashIndex, int start, int end, int initialCapacity, float loadFactor) { this.ascWords = ascWords; this.start = start; this.end = end; this.count = end - start; this.hashIndex = hashIndex; subs = new HashMap/* <Object, SubDictionaryWrap> */(initialCapacity, loadFactor); createSubDictionaries(); } // ------------------------------------------------- /** * 创建分词典映射,为构造函数调用 */ protected void createSubDictionaries() { if (this.start >= ascWords.length) { return; } // 定位相同头字符词语的开头和结束位置以确认分字典 int beginIndex = this.start; int endIndex = this.start + 1; char beginHashChar = getChar(ascWords[start], hashIndex); char endHashChar; for (; endIndex < this.end; endIndex++) { endHashChar = getChar(ascWords[endIndex], hashIndex); if (endHashChar != beginHashChar) { addSubDictionary(beginHashChar, beginIndex, endIndex); beginIndex = endIndex; beginHashChar = endHashChar; } } addSubDictionary(beginHashChar, beginIndex, this.end); } protected char getChar(CharSequence s, int index) { if (index >= s.length()) { return (char) 0; } return s.charAt(index); } /** * 将位置在beginIndex和endIndex之间(不包括endIndex)的词语作为一个分词典 * * @param hashChar * @param beginIndex * @param endIndex */ protected void addSubDictionary(char hashChar, int beginIndex, int endIndex) { Dictionary subDic = createSubDictionary(ascWords, beginIndex, endIndex); SubDictionaryWrap subDicWrap = new SubDictionaryWrap(hashChar, subDic, beginIndex); subs.put(keyOf(hashChar), subDicWrap); } protected Dictionary createSubDictionary(Word[] ascWords, int beginIndex, int endIndex) { int count = endIndex - beginIndex; if (count < 16) { return new BinaryDictionary(ascWords, beginIndex, endIndex); } else { return new HashBinaryDictionary(ascWords, hashIndex + 1, beginIndex, endIndex, getCapacity(count), 0.75f); } } protected static final int[] capacityCandiate = { 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 10192 }; protected int getCapacity(int count) { int capacity = -1; count <<= 2; count /= 3; count += 1; for (int i = 0; i < capacityCandiate.length; i++) { if (count <= capacityCandiate[i]) { capacity = capacityCandiate[i]; break; } } if (capacity < 0) { capacity = capacityCandiate[capacityCandiate.length - 1]; } return capacity; } // ------------------------------------------------- public Word get(int index) { return ascWords[start + index]; } public Hit search(CharSequence input, int begin, int count) { SubDictionaryWrap subDic = (SubDictionaryWrap) subs.get(keyOf(input .charAt(hashIndex + begin))); if (subDic == null) { return Hit.UNDEFINED; } Dictionary dic = subDic.dic; // 对count==hashIndex + 1的处理 if (count == hashIndex + 1) { Word header = dic.get(0); if (header.length() == hashIndex + 1) { if (subDic.wordIndexOffset + 1 < this.ascWords.length) { return new Hit(subDic.wordIndexOffset, header, this.ascWords[subDic.wordIndexOffset + 1]); } else { return new Hit(subDic.wordIndexOffset, header, null); } } else { return new Hit(Hit.UNCLOSED_INDEX, null, header); } } // count > hashIndex + 1 Hit word = dic.search(input, begin, count); if (word.isHit()) { int index = subDic.wordIndexOffset + word.getIndex(); word.setIndex(index); if (word.getNext() == null && index < size()) { word.setNext(get(index + 1)); } } return word; } public int size() { return count; } // ------------------------------------------------- /** * 字符的在{@link #subs}的key值。 * * @param theChar * @return * * @see #subs */ protected Object keyOf(char theChar) { // return theChar - 0x4E00;// '一'==0x4E00 return new Integer(theChar); } /** * 分词典封箱 */ static class SubDictionaryWrap { /** * 分词典词组的头字符 */ char hashChar; /** * 分词典 */ Dictionary dic; /** * 分词典第一个词语在所有词语中的偏移位置 */ int wordIndexOffset; public SubDictionaryWrap(char hashChar, Dictionary dic, int wordIndexOffset) { this.hashChar = hashChar; this.dic = dic; this.wordIndexOffset = wordIndexOffset; } } }