/** * Created by jpbirdy on 14-11-22. */ package jpbirdy.segment; import java.util.ArrayList; import java.util.List; /** * @author jpbirdy * @project Segmentation * @class SegDict * @date 14-11-22 20:39 * @desc */ public class SegDict { private SegNode root; private int maxLength; private int num; private List<SegToken> tokens; private long totalFrequency; public SegDict() { root = new SegNode(); maxLength = 0; num = 0; tokens = new ArrayList<SegToken>(); totalFrequency = 0; } public SegNode getRoot() { return root; } public void setRoot(SegNode root) { this.root = root; } public int getMaxLength() { return maxLength; } public void setMaxLength(int maxLength) { this.maxLength = maxLength; } public int getNum() { return num; } public void setNum(int num) { this.num = num; } public List<SegToken> getTokens() { return tokens; } public void setTokens(List<SegToken> tokens) { this.tokens = tokens; } public long getTotalFrequency() { return totalFrequency; } public void setTotalFrequency(long totalFrequency) { this.totalFrequency = totalFrequency; } ////////end getter and setter public static class SearchNode { private int index; private boolean success; public SearchNode(int i, boolean s) { index = i; success = s; } public int getIndex() { return index; } public boolean isSuccess() { return success; } } // 二分法查找字元在子节点中的位置 // 如果查找成功,第一个返回参数为找到的位置,第二个返回参数为true // 如果查找失败,第一个返回参数为应当插入的位置,第二个返回参数false public SearchNode binarySearch(List<SegNode> nodes, String word) { int start = 0; int end = nodes.size() - 1; //nodes为空,直接插入 if (end < 0) return new SearchNode(0, false); //首尾优化 int compareWithFirstWord = word.compareTo(nodes.get(start).getWord()); if (compareWithFirstWord < 0) { return new SearchNode(start, false); } else if (compareWithFirstWord == 0) { return new SearchNode(start, true); } int compareWithLastWord = word.compareTo(nodes.get(end).getWord()); if (compareWithLastWord == 0) { return new SearchNode(end, true); } else if (compareWithLastWord > 0) { return new SearchNode(end + 1, false); } //二分 int current = (start + end) >> 1; while ((end - start) > 1) { int compareWithCurrentWord = word.compareTo(nodes.get(current).getWord()); if (compareWithCurrentWord == 0) return new SearchNode(current, true); else if (compareWithCurrentWord < 0) end = current; else start = current; current = (start + end) >> 1; } return new SearchNode(end, false); } // 将字元加入节点数组中,并返回插入的节点指针 // 如果字元已经存在则返回存在的节点指针 public SegNode upsert(List<SegNode> nodes, String word) { SearchNode searchNode = binarySearch(nodes, word); //查找成功 if (searchNode.isSuccess()) { return nodes.get(searchNode.getIndex()); } if (nodes == null) nodes = new ArrayList<SegNode>(); nodes.add(searchNode.getIndex(), new SegNode(word)); return nodes.get(searchNode.getIndex()); } public SegDict addToken(SegToken token) { SegNode current = root; for (String word : token.getText()) { current = upsert(current.getChildren(), word); } if (current.getToken() == null) { current.setToken(token); if (token.getText().size() > maxLength) { maxLength = token.getText().size(); } num++; tokens.add(token); totalFrequency += token.getFrequency(); } return this; } // 在词典中查找和字元组words可以前缀匹配的所有分词 // 返回值为找到的分词数 public int lookupTokens(List<String> words, List<SegToken> tokens) { if (words.size() == 0) return 0; SegNode current = root; int numTokens = 0; for (String word : words) { // 如果已经抵达叶子节点则不再继续寻找 if (current.getChildren().size() == 0) break; SearchNode searchNode = binarySearch(current.getChildren(), word); // 否则在该节点子节点中进行下个字元的匹配 if (!searchNode.isSuccess()) { break; } current = current.getChildren().get(searchNode.getIndex()); if (current.getToken() != null) { if (tokens.size() - 1 < numTokens) tokens.add(null); tokens.set(numTokens, current.getToken()); numTokens++; } } return numTokens; } public static void main(String[] args) throws Exception { // List<Integer> list = new ArrayList<Integer>(); // list.add(1); // list.add(2); // list.add(3); // list.add(4); // list.add(5); // list.add(6); // list.add(1,99); // System.out.println(list); } }