/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.cn.smart.hhmm;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
import org.apache.lucene.analysis.cn.smart.Utility;

/**
 * SmartChineseAnalyzer Word Dictionary
 * @lucene.experimental
 */
class WordDictionary extends AbstractDictionary {

  private WordDictionary() {
  }

  private static WordDictionary singleInstance;

  /**
   * Large prime number for hash function
   */
  public static final int PRIME_INDEX_LENGTH = 12071;

  /**
   * wordIndexTable hashes every Chinese character in Unicode into an array of
   * PRIME_INDEX_LENGTH entries. Collisions are possible, but in practice this
   * program only handles the 6768 characters found in GB2312 plus some ASCII
   * characters; to keep lookups precise, the original character is therefore
   * retained in charIndexTable.
   */
  private short[] wordIndexTable;

  private char[] charIndexTable;

  /**
   * To avoid taking too much space, the lexicon is stored in two parallel
   * multidimensional arrays, one for words and one for frequencies. Each word
   * is a char[], where each char is a Chinese character or other symbol; each
   * frequency is an int. The two arrays correspond one-to-one, so
   * wordItem_charArrayTable[i][j] looks up a word in the lexicon and
   * wordItem_frequencyTable[i][j] looks up its frequency.
   */
  private char[][][] wordItem_charArrayTable;

  private int[][] wordItem_frequencyTable;

  // static Logger log = Logger.getLogger(WordDictionary.class);
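  /*
   * Illustrative lookup sketch (assumes a loaded dictionary; the character is
   * an arbitrary example): the first character of a word is hashed into
   * charIndexTable/wordIndexTable, and the stored id selects the
   * per-character word list in the two parallel tables.
   *
   *   short hashIndex = getWordItemTableIndex('中');
   *   char[][] words = wordItem_charArrayTable[wordIndexTable[hashIndex]];
   *   int[] freqs = wordItem_frequencyTable[wordIndexTable[hashIndex]];
   *   // words[j] holds the j-th word starting with '中' (stored without its
   *   // first character), and freqs[j] holds the corresponding frequency.
   */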
  /**
   * Get the singleton dictionary instance.
   * @return singleton
   */
  public synchronized static WordDictionary getInstance() {
    if (singleInstance == null) {
      singleInstance = new WordDictionary();
      try {
        singleInstance.load();
      } catch (IOException e) {
        String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
        singleInstance.load(wordDictRoot);
      } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
      }
    }
    return singleInstance;
  }

  /**
   * Attempt to load the dictionary from the provided directory, first trying
   * coredict.mem and falling back on coredict.dct.
   *
   * @param dctFileRoot path to dictionary directory
   */
  public void load(String dctFileRoot) {
    String dctFilePath = dctFileRoot + "/coredict.dct";
    File serialObj = new File(dctFileRoot + "/coredict.mem");

    if (!(serialObj.exists() && loadFromObj(serialObj))) {
      try {
        wordIndexTable = new short[PRIME_INDEX_LENGTH];
        charIndexTable = new char[PRIME_INDEX_LENGTH];
        for (int i = 0; i < PRIME_INDEX_LENGTH; i++) {
          charIndexTable[i] = 0;
          wordIndexTable[i] = -1;
        }
        wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][];
        wordItem_frequencyTable = new int[GB2312_CHAR_NUM][];
        // int total =
        loadMainDataFromFile(dctFilePath);
        expandDelimiterData();
        mergeSameWords();
        sortEachItems();
        // log.info("load dictionary: " + dctFilePath + " total:" + total);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }

      saveToObj(serialObj);
    }
  }

  /**
   * Load coredict.mem internally from the jar file.
   *
   * @throws ClassNotFoundException if the serialized classes cannot be resolved
   * @throws IOException if the resource cannot be read
   */
  public void load() throws IOException, ClassNotFoundException {
    InputStream input = this.getClass().getResourceAsStream("coredict.mem");
    loadFromObjectInputStream(input);
  }

  private boolean loadFromObj(File serialObj) {
    try {
      loadFromObjectInputStream(new FileInputStream(serialObj));
      return true;
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    } catch (ClassNotFoundException e) {
      e.printStackTrace();
    }
    return false;
  }

  private void loadFromObjectInputStream(InputStream serialObjectInputStream)
      throws IOException, ClassNotFoundException {
    ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
    wordIndexTable = (short[]) input.readObject();
    charIndexTable = (char[]) input.readObject();
    wordItem_charArrayTable = (char[][][]) input.readObject();
    wordItem_frequencyTable = (int[][]) input.readObject();
    // log.info("load core dict from serialization.");
    input.close();
  }

  private void saveToObj(File serialObj) {
    try {
      ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
          serialObj));
      output.writeObject(wordIndexTable);
      output.writeObject(charIndexTable);
      output.writeObject(wordItem_charArrayTable);
      output.writeObject(wordItem_frequencyTable);
      output.close();
      // log.info("serialize core dict.");
    } catch (Exception e) {
      // log.warn(e.getMessage());
    }
  }
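  /*
   * Loading sketch (illustration only; the directory path is a placeholder):
   * getInstance() first tries the serialized coredict.mem bundled with the
   * class and, on an IOException, falls back to loading from
   * AnalyzerProfile.ANALYSIS_DATA_DIR. load(String) does the same for an
   * explicit directory, rebuilding from coredict.dct and re-serializing it
   * when coredict.mem is absent.
   *
   *   WordDictionary dict = WordDictionary.getInstance();
   *   // or, with an external data directory:
   *   // dict.load("/path/to/analysis-data");
   */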
  /**
   * Load the datafile into this WordDictionary
   *
   * @param dctFilePath path to word dictionary (coredict.dct)
   * @return number of words read
   * @throws FileNotFoundException
   * @throws IOException
   * @throws UnsupportedEncodingException
   */
  private int loadMainDataFromFile(String dctFilePath)
      throws FileNotFoundException, IOException, UnsupportedEncodingException {
    int i, cnt, length, total = 0;
    // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
    // The 3756th is used (as a header) to store information.
    int[] buffer = new int[3];
    byte[] intBuffer = new byte[4];
    String tmpword;
    RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");

    // GB2312 characters 0 - 6768
    for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
      // if (i == 5231)
      // System.out.println(i);

      dctFile.read(intBuffer);
      // the dictionary was developed for C, and byte order must be converted to work with Java
      cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
      if (cnt <= 0) {
        wordItem_charArrayTable[i] = null;
        wordItem_frequencyTable[i] = null;
        continue;
      }
      wordItem_charArrayTable[i] = new char[cnt][];
      wordItem_frequencyTable[i] = new int[cnt];
      total += cnt;
      int j = 0;
      while (j < cnt) {
        // wordItemTable[i][j] = new WordItem();
        dctFile.read(intBuffer);
        buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
            .getInt(); // frequency
        dctFile.read(intBuffer);
        buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
            .getInt(); // length
        dctFile.read(intBuffer);
        buffer[2] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
            .getInt(); // handle

        // wordItemTable[i][j].frequency = buffer[0];
        wordItem_frequencyTable[i][j] = buffer[0];
        length = buffer[1];
        if (length > 0) {
          byte[] lchBuffer = new byte[length];
          dctFile.read(lchBuffer);
          tmpword = new String(lchBuffer, "GB2312");
          // indexTable[i].wordItems[j].word = tmpword;
          // wordItemTable[i][j].charArray = tmpword.toCharArray();
          wordItem_charArrayTable[i][j] = tmpword.toCharArray();
        } else {
          // wordItemTable[i][j].charArray = null;
          wordItem_charArrayTable[i][j] = null;
        }
        // System.out.println(indexTable[i].wordItems[j]);
        j++;
      }

      String str = getCCByGB2312Id(i);
      setTableIndex(str.charAt(0), i);
    }
    dctFile.close();
    return total;
  }
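  /*
   * Record layout read by loadMainDataFromFile (inferred from the reads above;
   * all integers are 4-byte little-endian):
   *
   *   for each of the CHAR_NUM_IN_FILE GB2312 slots:
   *     int cnt              number of words starting with this character
   *     cnt times:
   *       int frequency      word frequency
   *       int length         byte length of the word body
   *       int handle         read but not used here
   *       byte[length]       word body, GB2312-encoded
   */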
  /**
   * The original lexicon puts all words that begin with punctuation into one
   * shared table entry (index 3755). Here that entry is expanded, and each
   * word is moved into the table entry of its corresponding symbol.
   */
  private void expandDelimiterData() {
    int i;
    int cnt;
    // Treat index 3755 as the shared punctuation entry and redistribute its
    // words into the entries of the individual punctuation characters.
    int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
    i = 0;
    while (i < wordItem_charArrayTable[delimiterIndex].length) {
      char c = wordItem_charArrayTable[delimiterIndex][i][0];
      int j = getGB2312Id(c); // the id value of the punctuation
      if (wordItem_charArrayTable[j] == null) {

        int k = i;
        // Starting from i, count how many consecutive entries begin with the
        // same punctuation character c.
        while (k < wordItem_charArrayTable[delimiterIndex].length
            && wordItem_charArrayTable[delimiterIndex][k][0] == c) {
          k++;
        }
        // c is the punctuation character, j is the id value of c;
        // k - 1 is the index of the last entry beginning with c.
        cnt = k - i;
        if (cnt != 0) {
          wordItem_charArrayTable[j] = new char[cnt][];
          wordItem_frequencyTable[j] = new int[cnt];
        }

        // Assign value for each wordItem.
        for (k = 0; k < cnt; k++, i++) {
          // wordItemTable[j][k] = new WordItem();
          wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
          wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1];
          System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1,
              wordItem_charArrayTable[j][k], 0,
              wordItem_charArrayTable[j][k].length);
        }
        setTableIndex(c, j);
      }
    }
    // Delete the original shared punctuation entry.
    wordItem_charArrayTable[delimiterIndex] = null;
    wordItem_frequencyTable[delimiterIndex] = null;
  }

  /*
   * Since we aren't doing POS-tagging, merge the frequencies of entries for
   * the same word (which differ only in POS).
   */
  private void mergeSameWords() {
    int i;
    for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
      if (wordItem_charArrayTable[i] == null)
        continue;
      int len = 1;
      for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
        if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
            wordItem_charArrayTable[i][j - 1], 0) != 0)
          len++;
      }
      if (len < wordItem_charArrayTable[i].length) {
        char[][] tempArray = new char[len][];
        int[] tempFreq = new int[len];
        int k = 0;
        tempArray[0] = wordItem_charArrayTable[i][0];
        tempFreq[0] = wordItem_frequencyTable[i][0];
        for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
          if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
              tempArray[k], 0) != 0) {
            k++;
            // temp[k] = wordItemTable[i][j];
            tempArray[k] = wordItem_charArrayTable[i][j];
            tempFreq[k] = wordItem_frequencyTable[i][j];
          } else {
            // temp[k].frequency += wordItemTable[i][j].frequency;
            tempFreq[k] += wordItem_frequencyTable[i][j];
          }
        }
        // wordItemTable[i] = temp;
        wordItem_charArrayTable[i] = tempArray;
        wordItem_frequencyTable[i] = tempFreq;
      }
    }
  }

  /*
   * Sort the words under each table entry (keeping the parallel frequency
   * table aligned) so that findInTable and getPrefixMatch can binary search
   * them.
   */
  private void sortEachItems() {
    char[] tmpArray;
    int tmpFreq;
    for (int i = 0; i < wordItem_charArrayTable.length; i++) {
      if (wordItem_charArrayTable[i] != null
          && wordItem_charArrayTable[i].length > 1) {
        for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) {
          for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) {
            if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
                wordItem_charArrayTable[i][j2], 0) > 0) {
              tmpArray = wordItem_charArrayTable[i][j];
              tmpFreq = wordItem_frequencyTable[i][j];
              wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2];
              wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2];
              wordItem_charArrayTable[i][j2] = tmpArray;
              wordItem_frequencyTable[i][j2] = tmpFreq;
            }
          }
        }
      }
    }
  }
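  /*
   * Worked example for mergeSameWords (hypothetical frequencies): if a table
   * entry lists the same word twice with frequencies 3 and 5 (one per POS tag
   * in the source lexicon), the merged entry keeps a single copy with
   * frequency 3 + 5 = 8. sortEachItems then orders each entry so that the
   * binary searches below can be used.
   */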
  /*
   * Calculate character c's position in the hash table, then initialize the
   * value of that position in the address table.
   */
  private boolean setTableIndex(char c, int j) {
    int index = getAvaliableTableIndex(c);
    if (index != -1) {
      charIndexTable[index] = c;
      wordIndexTable[index] = (short) j;
      return true;
    } else
      return false;
  }

  /*
   * Find a free slot (or the slot already assigned to c) using double hashing.
   */
  private short getAvaliableTableIndex(char c) {
    int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
    int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
    if (hash1 < 0)
      hash1 = PRIME_INDEX_LENGTH + hash1;
    if (hash2 < 0)
      hash2 = PRIME_INDEX_LENGTH + hash2;
    int index = hash1;
    int i = 1;
    while (charIndexTable[index] != 0 && charIndexTable[index] != c
        && i < PRIME_INDEX_LENGTH) {
      index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
      i++;
    }
    // System.out.println(i - 1);

    if (i < PRIME_INDEX_LENGTH
        && (charIndexTable[index] == 0 || charIndexTable[index] == c)) {
      return (short) index;
    } else
      return -1;
  }

  /*
   * Find the hash-table slot assigned to character c, or -1 if c is not in
   * the table.
   */
  private short getWordItemTableIndex(char c) {
    int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
    int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
    if (hash1 < 0)
      hash1 = PRIME_INDEX_LENGTH + hash1;
    if (hash2 < 0)
      hash2 = PRIME_INDEX_LENGTH + hash2;
    int index = hash1;
    int i = 1;
    while (charIndexTable[index] != 0 && charIndexTable[index] != c
        && i < PRIME_INDEX_LENGTH) {
      index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
      i++;
    }

    if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) {
      return (short) index;
    } else
      return -1;
  }
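  /*
   * Probe-sequence sketch for the double hashing above (hypothetical slot
   * numbers): if hash1(c) maps to slot 100 and that slot is occupied by a
   * different character, the candidates tried next are (100 + 1 * hash2(c)),
   * (100 + 2 * hash2(c)), ... modulo PRIME_INDEX_LENGTH, until a free slot is
   * found, a slot already assigned to c is found, or PRIME_INDEX_LENGTH
   * probes have been made.
   */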
  /**
   * Look up the given word (as a char array) and return its position in the
   * word list.
   *
   * @param knownHashIndex the position of the first character charArray[0] in
   *   the hash table, already computed; if it has not been computed yet, it
   *   can be obtained with getWordItemTableIndex(charArray[0]).
   * @param charArray the char array of the word to look up.
   * @return word location in the word array, or -1 if not found.
   */
  private int findInTable(short knownHashIndex, char[] charArray) {
    if (charArray == null || charArray.length == 0)
      return -1;

    char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]];
    int start = 0, end = items.length - 1;
    int mid = (start + end) / 2, cmpResult;

    // Binary search for charArray among the sorted items
    while (start <= end) {
      cmpResult = Utility.compareArray(items[mid], 0, charArray, 1);

      if (cmpResult == 0)
        return mid; // found it
      else if (cmpResult < 0)
        start = mid + 1;
      else if (cmpResult > 0)
        end = mid - 1;

      mid = (start + end) / 2;
    }
    return -1;
  }

  /**
   * Find the first word in the dictionary that starts with the supplied prefix
   *
   * @see #getPrefixMatch(char[], int)
   * @param charArray input prefix
   * @return index of word, or -1 if not found
   */
  public int getPrefixMatch(char[] charArray) {
    return getPrefixMatch(charArray, 0);
  }

  /**
   * Find the nth word in the dictionary that starts with the supplied prefix
   *
   * @see #getPrefixMatch(char[])
   * @param charArray input prefix
   * @param knownStart relative position in the dictionary to start
   * @return index of word, or -1 if not found
   */
  public int getPrefixMatch(char[] charArray, int knownStart) {
    short index = getWordItemTableIndex(charArray[0]);
    if (index == -1)
      return -1;
    char[][] items = wordItem_charArrayTable[wordIndexTable[index]];
    int start = knownStart, end = items.length - 1;

    int mid = (start + end) / 2, cmpResult;

    // Binary search for charArray among the sorted items
    while (start <= end) {
      cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0);
      if (cmpResult == 0) {
        // Back up to the first item that matches the current prefix
        while (mid >= 0
            && Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
          mid--;
        mid++;
        return mid; // the first word that uses charArray as prefix
      } else if (cmpResult < 0)
        end = mid - 1;
      else
        start = mid + 1;
      mid = (start + end) / 2;
    }
    return -1;
  }

  /**
   * Get the frequency of a word from the dictionary
   *
   * @param charArray input word
   * @return word frequency, or zero if the word is not found
   */
  public int getFrequency(char[] charArray) {
    short hashIndex = getWordItemTableIndex(charArray[0]);
    if (hashIndex == -1)
      return 0;
    int itemIndex = findInTable(hashIndex, charArray);
    if (itemIndex != -1)
      return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
    return 0;
  }

  /**
   * Return true if the dictionary entry at itemIndex for table charArray[0] is charArray
   *
   * @param charArray input word
   * @param itemIndex item index for table charArray[0]
   * @return true if the entry at itemIndex equals charArray
   */
  public boolean isEqual(char[] charArray, int itemIndex) {
    short hashIndex = getWordItemTableIndex(charArray[0]);
    return Utility.compareArray(charArray, 1,
        wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;
  }

}
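/*
 * Illustrative usage sketch (the word literal is an arbitrary example, not
 * necessarily present in the lexicon):
 *
 *   WordDictionary dict = WordDictionary.getInstance();
 *   char[] word = "你好".toCharArray();
 *   int freq = dict.getFrequency(word);    // 0 if the word is unknown
 *   int match = dict.getPrefixMatch(word); // first entry with this prefix, or -1
 */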