/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.cn.smart.hhmm;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.cn.smart.Utility;

/**
 * Graph representing possible token pairs (bigrams) at each start offset in the sentence.
 * <p>
 * For each start offset, a list of possible token pairs is stored.
 * </p>
 * @lucene.experimental
 */
class BiSegGraph {

  /** Map from the index of the second token of a pair to all pairs ending at that token. */
  private Map<Integer,ArrayList<SegTokenPair>> tokenPairListTable =
      new HashMap<Integer,ArrayList<SegTokenPair>>();

  /** Ordered, indexed list of all tokens in the underlying {@link SegGraph}. */
  private List<SegToken> segTokenList;

  private static BigramDictionary bigramDict = BigramDictionary.getInstance();

  public BiSegGraph(SegGraph segGraph) {
    // Note: generateBiSegGraph() assigns segTokenList itself; the previous
    // extra segGraph.makeIndex() call here was redundant and has been removed.
    generateBiSegGraph(segGraph);
  }

  /*
   * Generate a BiSegGraph based upon a SegGraph: for every token, connect it to
   * every token that can immediately follow it, weighting each edge by the
   * smoothed negative log-probability of the bigram.
   */
  private void generateBiSegGraph(SegGraph segGraph) {
    double smooth = 0.1;
    int wordPairFreq = 0;
    int maxStart = segGraph.getMaxStart();
    double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE;

    int next;
    char[] idBuffer;
    // get the list of tokens ordered and indexed
    segTokenList = segGraph.makeIndex();
    // Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1
    int key = -1;
    List<SegToken> nextTokens = null;
    while (key < maxStart) {
      if (segGraph.isStartExist(key)) {

        List<SegToken> tokenList = segGraph.getStartList(key);

        // Calculate all tokens for a given key.
        for (SegToken t1 : tokenList) {
          oneWordFreq = t1.weight;
          next = t1.endOffset;
          nextTokens = null;
          // Find the next corresponding Token.
          // For example: "Sunny seashore", the present Token is "sunny", next one should be "sea" or "seashore".
          // If we cannot find the next Token, then go to the end and repeat the same cycle.
          while (next <= maxStart) {
            // Because the beginning position of endToken is sentenceLen, so equal to sentenceLen can find endToken.
            if (segGraph.isStartExist(next)) {
              nextTokens = segGraph.getStartList(next);
              break;
            }
            next++;
          }
          if (nextTokens == null) {
            break;
          }
          for (SegToken t2 : nextTokens) {
            // Build the bigram key: t1 chars + separator + t2 chars.
            idBuffer = new char[t1.charArray.length + t2.charArray.length + 1];
            System.arraycopy(t1.charArray, 0, idBuffer, 0, t1.charArray.length);
            idBuffer[t1.charArray.length] = BigramDictionary.WORD_SEGMENT_CHAR;
            System.arraycopy(t2.charArray, 0, idBuffer,
                t1.charArray.length + 1, t2.charArray.length);

            // Two linked Words frequency
            wordPairFreq = bigramDict.getFrequency(idBuffer);

            // Smoothing

            // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
            weight = -Math
                .log(smooth
                    * (1.0 + oneWordFreq)
                    / (Utility.MAX_FREQUENCE + 0.0)
                    + (1.0 - smooth)
                    * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));

            SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.index,
                t2.index, weight);
            this.addSegTokenPair(tokenPair);
          }
        }
      }
      key++;
    }
  }

  /**
   * Returns true if there is a list of token pairs at this offset (index of the second token)
   *
   * @param to index of the second token in the token pair
   * @return true if a token pair exists
   */
  public boolean isToExist(int to) {
    return tokenPairListTable.get(Integer.valueOf(to)) != null;
  }

  /**
   * Return a {@link List} of all token pairs at this offset (index of the second token)
   *
   * @param to index of the second token in the token pair
   * @return {@link List} of token pairs.
   */
  public List<SegTokenPair> getToList(int to) {
    return tokenPairListTable.get(to);
  }

  /**
   * Add a {@link SegTokenPair}
   *
   * @param tokenPair {@link SegTokenPair}
   */
  public void addSegTokenPair(SegTokenPair tokenPair) {
    int to = tokenPair.to;
    // Single map lookup instead of the previous isToExist() + get() double lookup.
    ArrayList<SegTokenPair> tokenPairList = tokenPairListTable.get(to);
    if (tokenPairList == null) {
      tokenPairList = new ArrayList<SegTokenPair>();
      tokenPairListTable.put(to, tokenPairList);
    }
    tokenPairList.add(tokenPair);
  }

  /**
   * Get the number of {@link SegTokenPair} entries in the table.
   * @return number of {@link SegTokenPair} entries
   */
  public int getToCount() {
    return tokenPairListTable.size();
  }

  /**
   * Find the shortest path with the Viterbi algorithm.
   * @return {@link List} of tokens on the minimum-weight path
   */
  public List<SegToken> getShortPath() {
    int current;
    int nodeCount = getToCount();
    List<PathNode> path = new ArrayList<PathNode>();
    PathNode zeroPath = new PathNode();
    zeroPath.weight = 0;
    zeroPath.preNode = 0;
    path.add(zeroPath);
    for (current = 1; current <= nodeCount; current++) {
      double weight;
      List<SegTokenPair> edges = getToList(current);

      double minWeight = Double.MAX_VALUE;
      SegTokenPair minEdge = null;
      for (SegTokenPair edge : edges) {
        weight = edge.weight;
        PathNode preNode = path.get(edge.from);
        if (preNode.weight + weight < minWeight) {
          minWeight = preNode.weight + weight;
          minEdge = edge;
        }
      }
      PathNode newNode = new PathNode();
      newNode.weight = minWeight;
      // NOTE(review): assumes every node 1..nodeCount has at least one incoming
      // edge (minEdge non-null); the graph construction appears to guarantee this.
      newNode.preNode = minEdge.from;
      path.add(newNode);
    }

    // Walk back from the last node to node 0 collecting predecessor indices,
    // then emit the corresponding tokens in forward order.
    int preNode, lastNode;
    lastNode = path.size() - 1;
    current = lastNode;
    List<Integer> rpath = new ArrayList<Integer>();
    List<SegToken> resultPath = new ArrayList<SegToken>();

    rpath.add(current);
    while (current != 0) {
      PathNode currentPathNode = path.get(current);

      preNode = currentPathNode.preNode;
      rpath.add(Integer.valueOf(preNode));
      current = preNode;
    }
    for (int j = rpath.size() - 1; j >= 0; j--) {
      // rpath is already List<Integer>; the previous explicit cast was redundant.
      int id = rpath.get(j).intValue();
      SegToken t = segTokenList.get(id);
      resultPath.add(t);
    }
    return resultPath;
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    Collection<ArrayList<SegTokenPair>> values = tokenPairListTable.values();
    for (ArrayList<SegTokenPair> segList : values) {
      for (SegTokenPair pair : segList) {
        // Append directly instead of concatenating a temporary String per pair.
        sb.append(pair).append('\n');
      }
    }
    return sb.toString();
  }

}