package com.fpcms.common.util;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

/**
 * Chinese word segmentation utility class.
 * Segmenter used: http://code.google.com/p/ik-analyzer/
 *
 * @author badqiu
 */
public class ChineseSegmenterUtil {

    /**
     * Splits the string into keywords and returns only those whose length is
     * at least minKeywordLength.
     */
    public static List<String> getMinLengthKeywords(String str, int minKeywordLength, boolean useSmart) {
        if (StringUtils.isBlank(str)) {
            return Collections.emptyList();
        }
        try {
            Map<String, Integer> words = segmenteForTokenCount(new StringReader(str), useSmart);
            List<String> result = new ArrayList<String>();
            for (String keyword : words.keySet()) {
                if (StringUtils.isNotBlank(keyword) && keyword.length() >= minKeywordLength) {
                    result.add(keyword);
                }
            }
            return result;
        } catch (Exception e) {
            throw new RuntimeException("cannot split keyword by str:" + str, e);
        }
    }

    /**
     * Counts the occurrences of each token, returning Map<token, count>.
     *
     * @param reader
     * @param useSmart
     * @return
     * @throws IOException
     */
    public static Map<String, Integer> segmenteForTokenCount(Reader reader, boolean useSmart) throws IOException {
        // useSmart == true makes the segmenter use maximum-word-length (smart) segmentation
        IKSegmenter ik = new IKSegmenter(reader, useSmart);
        Lexeme lexeme = null;
        Map<String, Integer> map = new HashMap<String, Integer>();
        while ((lexeme = ik.next()) != null) {
            Integer count = map.get(lexeme.getLexemeText());
            if (count == null) {
                count = 0;
            }
            count++;
            map.put(lexeme.getLexemeText(), count);
        }
        return map;
    }

    public static Map<String, Integer> segmenteForTokenCount(String string) {
        return segmenteForTokenCount(string, false);
    }

    /**
     * Counts the occurrences of each token, returning Map<token, count>.
     *
     * @param string
     * @param useSmart
     * @return
     */
    public static Map<String, Integer> segmenteForTokenCount(String string, boolean useSmart) {
        try {
            return segmenteForTokenCount(new StringReader(string), useSmart);
        } catch (Exception e) {
            throw new RuntimeException("segmente error,string:" + string, e);
        }
    }

    /**
     * Converts Map<token, count> into a List<TokenCount> sorted by count in descending order.
     *
     * @param tokenCountMap
     * @return
     */
    public static List<TokenCount> toSortedTokenCountList(Map<String, Integer> tokenCountMap) {
        List<TokenCount> tokens = new ArrayList<TokenCount>();
        for (Map.Entry<String, Integer> entry : tokenCountMap.entrySet()) {
            tokens.add(new TokenCount(entry.getKey(), entry.getValue()));
        }
        Collections.sort(tokens);
        return tokens;
    }

    public static class TokenCount implements Comparable<TokenCount> {
        private String token;
        private int count;

        public TokenCount() {
        }

        public TokenCount(String token, int count) {
            super();
            this.token = token;
            this.count = count;
        }

        public String getToken() {
            return token;
        }

        public void setToken(String token) {
            this.token = token;
        }

        public int getCount() {
            return count;
        }

        public void setCount(int count) {
            this.count = count;
        }

        @Override
        public String toString() {
            return "TokenCount [token=" + token + ", count=" + count + "]";
        }

        @Override
        public int compareTo(TokenCount o) {
            // sort by count in descending order
            if (count == o.count) {
                return 0;
            }
            if (count > o.getCount()) {
                return -1;
            } else {
                return 1;
            }
        }
    }
}
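
/*
 * Usage sketch (illustrative only, assuming the IK Analyzer and commons-lang
 * jars are on the classpath; the sample input string is made up):
 *
 *   Map<String, Integer> counts =
 *       ChineseSegmenterUtil.segmenteForTokenCount("中文分词工具类示例", true);
 *   List<ChineseSegmenterUtil.TokenCount> sorted =
 *       ChineseSegmenterUtil.toSortedTokenCountList(counts);
 *   List<String> keywords =
 *       ChineseSegmenterUtil.getMinLengthKeywords("中文分词工具类示例", 2, true);
 */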