package com.antbrains.wordseg; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map.Entry; import com.antbrains.crf.hadoop.FileTools; import com.antbrains.datrie.DoubleArrayTrie; /** * 正向最大匹配分词 * * @author lili * */ public class MMSeg { private DoubleArrayTrie trie; public MMSeg(DoubleArrayTrie trie) { this.trie = trie; } public MMSeg(List<String> wordList) { trie = new DoubleArrayTrie(); for (String word : wordList) { trie.coverInsert(word, 0); } } public MMSeg(String dictPath) { try { List<String> wordList = FileTools.readFile2List(dictPath, "UTF-8"); trie = new DoubleArrayTrie(); for (String word : wordList) { word=word.trim(); if(word.length()<2) continue; trie.coverInsert(word, 0); } } catch (IOException e) { e.printStackTrace(); } } private String cnNumbers = "零一二三四五六七八九十百千万亿"; public boolean processNumber = false; public List<Token> seg(String sen) { List<Token> tokens = new ArrayList<Token>(); for (int i = 0; i < sen.length(); i++) { String ch = sen.substring(i, i + 1); int len = trie.find(sen, i)[0]; if (processNumber && cnNumbers.contains(ch)) { // 处理汉字里的数字,阿拉伯数字在前面lucene的分析器已经处理过了 int j = i + 1; for (; j < sen.length(); j++) { ch = sen.substring(j, j + 1); if (!cnNumbers.contains(ch)) break; } if (j - i > 1 && j - i > len) { tokens.add(new Token(sen, i, j)); i += (j - i - 1); continue; } } if (len > 1) { tokens.add(new Token(sen, i, i + len)); i += (len - 1); } else { tokens.add(new Token(sen, i, i + 1)); } } return tokens; } }