package com.blinkcoder.kit;

import com.jfinal.log.Logger;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

/**
 * Keyword utilities built on the IK segmenter: splits a sentence into terms,
 * drops stop words, and leaves reserved keywords untouched.
 *
 * <p>Two classpath dictionaries are loaded once when the class is initialised:
 * {@code /stopword.dic} (terms to discard) and {@code /keywords.dic} (keys that
 * must not be segmented). Entries are trimmed and lower-cased on load because
 * every lookup is performed on a lower-cased value.
 *
 * User: Michael Chen
 * Email: yidongnan@gmail.com
 * Date: 14-3-3
 * Time: 2:09 PM
 */
public class IKAnalyzerKit {

    private final static Logger log = Logger.getLogger(IKAnalyzerKit.class);

    /**
     * Encoding used to decode the dictionary files. Fixed rather than left to the
     * platform default so the same entries are loaded on every host.
     * NOTE(review): assumes the .dic resources are stored as UTF-8 - confirm.
     */
    private static final String DICTIONARY_ENCODING = "UTF-8";

    /** Lower-cased stop words; terms found here are removed by {@link #splitKeywords(String)}. */
    private final static Set<String> nowords = loadWords("/stopword.dic", "stopword");

    /** Lower-cased reserved keys; {@link #cleanupKey(String)} returns these unchanged. */
    private final static Set<String> ReserveKeys = loadWords("/keywords.dic", "keywords");

    /**
     * Reads a one-entry-per-line dictionary from the classpath.
     *
     * <p>A missing or unreadable resource is logged and yields whatever was read so
     * far (possibly nothing) instead of failing class initialisation.
     *
     * @param resource absolute classpath location of the dictionary
     * @param label    short name of the dictionary, used only in log messages
     * @return an unmodifiable set of trimmed, lower-cased, non-empty entries
     */
    private static Set<String> loadWords(String resource, String label) {
        Set<String> words = new HashSet<String>();
        InputStream in = IKAnalyzerKit.class.getResourceAsStream(resource);
        if (in == null) {
            // getResourceAsStream signals "not found" with null, not an exception.
            log.error("Unabled to read " + label + " file: " + resource + " not found on classpath");
            return Collections.unmodifiableSet(words);
        }
        try (InputStream stream = in) {
            // Older commons-io releases return a raw List here, hence the local suppression.
            @SuppressWarnings("unchecked")
            List<String> lines = IOUtils.readLines(stream, DICTIONARY_ENCODING);
            for (String line : lines) {
                String word = line.trim().toLowerCase(Locale.ROOT);
                if (!word.isEmpty()) {
                    words.add(word);
                }
            }
        } catch (IOException e) {
            log.error("Unabled to read " + label + " file", e);
        }
        return Collections.unmodifiableSet(words);
    }

    /**
     * Normalises a search key.
     *
     * <p>If the trimmed, lower-cased key is a reserved keyword it is returned exactly
     * as given. Otherwise the key is segmented with {@link #splitKeywords(String)}
     * and the surviving terms are joined with single spaces.
     *
     * @param key the raw key; may be {@code null}
     * @return the original key when reserved, otherwise the space-joined terms;
     *         an empty string when {@code key} is {@code null}, blank, or yields no terms
     */
    public static String cleanupKey(String key) {
        if (key == null) {
            return "";
        }
        if (ReserveKeys.contains(key.trim().toLowerCase(Locale.ROOT))) {
            return key;
        }
        StringBuilder sb = new StringBuilder();
        for (String word : splitKeywords(key)) {
            if (sb.length() > 0) {
                sb.append(' ');
            }
            sb.append(word);
        }
        return sb.toString();
    }

    /**
     * Splits a sentence into terms with the IK segmenter and removes stop words.
     *
     * <p>If the segmenter throws an {@link IOException} the error is logged and the
     * terms collected up to that point are returned.
     *
     * @param sentence the text to segment; may be {@code null} or blank
     * @return a new mutable list of terms in segmentation order; empty for
     *         {@code null} or blank input
     */
    public static List<String> splitKeywords(String sentence) {
        List<String> keys = new ArrayList<>();
        if (StringUtils.isBlank(sentence)) {
            return keys;
        }
        // Second constructor argument is IK's "useSmart" flag.
        IKSegmenter ikseg = new IKSegmenter(new StringReader(sentence), true);
        try {
            // next() returns null once the input is exhausted.
            for (Lexeme me = ikseg.next(); me != null; me = ikseg.next()) {
                String term = me.getLexemeText();
                if (!nowords.contains(term.toLowerCase(Locale.ROOT))) {
                    keys.add(term);
                }
            }
        } catch (IOException e) {
            log.error("Unable to split keywords", e);
        }
        return keys;
    }
}