package com.fpcms.common.util; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.StringTokenizer; import org.apache.commons.collections.comparators.ReverseComparator; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class KeywordUtil { private static Logger logger = LoggerFactory.getLogger(KeywordUtil.class); /** 常用标点符号 */ public static String COMMON_SYMBOLES = ",.?;!,。?;!"; public static String DELIMITERS = " \t\n\r\f,.!?;:'/\"\\()+=-_<>,。!?;:、=+-——/·#—¥%—…—*()‘“”~`《》@#$%^&*~`|\\【】"; /** * 敏感词 */ private static Set<String> sensitiveKeywordSet = readKeywords("/keyword/sensitive_keyword.txt"); /** * 非名词 */ private static Set<String> nonNameKeywordSet = readKeywords("/keyword/nonName.txt"); { logger.info("sensitive_keyword:"+sensitiveKeywordSet); } /** * 得到常用标点符号的个数 * @param str * @return */ public static int getCommonSymbolsCount(String str) { if(StringUtils.isBlank(str)) { return 0; } int result = 0; for(int i = 0; i < str.length(); i++) { char c = str.charAt(i); if(COMMON_SYMBOLES.indexOf(c) >= 0) { result++; } } return result; } /** * 得到最大长度的字符串 * @param keywords * @return */ public static String getMaxLengthToken(String keywords) { if(StringUtils.isBlank(keywords)) { return keywords; } List<String> keywordsList = KeywordUtil.toTokenizerList(keywords); Collections.sort(keywordsList,new ReverseComparator(new StringLengthComparator())); String maxLengthKeyword = keywordsList.get(0); return maxLengthKeyword; } public static String getPerfectKeyword(String content,String keyword) { if(StringUtils.isBlank(content)) { return null; } if(StringUtils.isBlank(keyword)) { return null; } ArrayList<String> tokens = toTokenizerList(content); Collections.sort(tokens, new Comparator<String>() { @Override public int compare(String o1, String o2) { return o2.length() - o1.length(); } }); for(String token : tokens) { // if(token.matches(".*\\d{4}.*")) { // continue; // } if(token.contains(keyword)) { return token; } } return null; } @SuppressWarnings("unchecked") public static ArrayList<String> toTokenizerList(String content) { StringTokenizer tokenizer = new StringTokenizer(content,KeywordUtil.DELIMITERS); ArrayList list = Collections.list(tokenizer); return (ArrayList<String>)list; } /** * 过滤敏感词 * @param list */ public static void filterSensitiveKeyword(Collection<String> list) { List<String> removeItems = new ArrayList<String>(); for(String str : list) { if(isSensitiveKeyword(str)) { removeItems.add(str); } } for(String str : removeItems) { list.remove(str); } } /** * 是否敏感关键字 * @param keyword * @return */ public static boolean isSensitiveKeyword(String keyword) { return getSensitiveKeyword(keyword) == null ? false : true; } /** * 得到敏感关键字 * @param keyword * @return */ public static String getSensitiveKeyword(String keyword) { if(StringUtils.isBlank(keyword)) { return null; } for(String str : sensitiveKeywordSet) { if(keyword.contains(str)) { return str; } } return null; } /** * 得到敏感关键字 * @param keyword * @return */ public static void assertSensitiveKeyword(String keyword) { String sensitiveKeyword = getSensitiveKeyword(keyword); if(sensitiveKeyword != null) { throw new IllegalArgumentException("sensitiveKeyword:"+sensitiveKeyword+" on String:"+keyword); } } /** * 过滤掉非名词 * @param list */ public static void filterNonNameKeyword(Collection<String> list) { List<String> removeItems = new ArrayList<String>(); for(String str : list) { if(!isNameKeyword(str)) { removeItems.add(str); } } for(String str : removeItems) { list.remove(str); } } /** * 判断一个词是否是名词 * @param keyword * @return */ public static boolean isNameKeyword(String keyword) { for(String str : nonNameKeywordSet) { if(keyword.contains(str)) { return false; } } return true; } public static Set<String> readKeywords(String classpathResource) { InputStream input = KeywordUtil.class.getResourceAsStream(classpathResource); try { List<String> lines = IOUtils.readLines(input); Set set = new HashSet(); for(String keywords : lines) { String[] array = org.springframework.util.StringUtils.tokenizeToStringArray(keywords,DELIMITERS); set.addAll(Arrays.asList(array)); } return set; }catch(IOException e) { throw new RuntimeException("load error,"+classpathResource,e); }finally { IOUtils.closeQuietly(input); } } public static String getRandomKeyword(String keywords) { List<String> keywordList = KeywordUtil.toTokenizerList(keywords); if(keywordList.isEmpty()) { return null; } return RandomUtil.randomSelect(keywordList); } public static int getMaxRank(String keywords,String site) { String[] keywordsArray = org.springframework.util.StringUtils.tokenizeToStringArray(keywords, ",_| "); Map<String,Integer> keywordRankMap = SearchEngineUtil.baiduKeywordsRank(keywords, site); if(keywordRankMap.isEmpty()) { return 0; } int min = Integer.MAX_VALUE; for(String keyword : keywordRankMap.keySet()) { int rank = keywordRankMap.get(keyword); if(rank > 0 && rank < min) { min = rank; } if(rank > 0) { logger.info("rank_baidu:"+rank+" site:"+site); } } return min == Integer.MAX_VALUE ? 0 : min; } static String[] a = {"啊","呀","嗯","啦","哪","吧"}; static String[] question = {"吗"}; static String[] symbols = {",",".","!",";"}; public static Object getSymbol(String token) { for(int i = 0; i < DELIMITERS.length(); i++) { char c = DELIMITERS.charAt(i); if(token.endsWith(""+c)) { return ""; } } for(String item : a) { if(token.endsWith(item)) { return "!"; } } for(String item : question) { if(token.endsWith(item)) { return "?"; } } return RandomUtil.randomSelect(symbols); } }