package org.ansj.library; import java.io.BufferedReader; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.Map.Entry; import org.ansj.dic.PathToStream; import org.ansj.domain.KV; import org.ansj.util.MyStaticValue; import org.nlpcn.commons.lang.tire.domain.SmartForest; import org.nlpcn.commons.lang.util.IOUtil; import org.nlpcn.commons.lang.util.StringUtil; import org.nlpcn.commons.lang.util.logging.Log; public class SynonymsLibrary { private static final Log LOG = MyStaticValue.getLog(SynonymsLibrary.class); // 同义词典 private static final Map<String, KV<String, SmartForest<List<String>>>> SYNONYMS = new HashMap<>(); public static final String DEFAULT = "synonyms"; static { for (Entry<String, String> entry : MyStaticValue.ENV.entrySet()) { if (entry.getKey().startsWith(DEFAULT)) { put(entry.getKey(), entry.getValue()); } } putIfAbsent(DEFAULT, "library/synonyms.dic"); } public static SmartForest<List<String>> get() { return get(DEFAULT); } /** */ public static SmartForest<List<String>> get(String key) { KV<String, SmartForest<List<String>>> kv = SYNONYMS.get(key); if (kv == null) { if (MyStaticValue.ENV.containsKey(key)) { putIfAbsent(key, MyStaticValue.ENV.get(key)); return get(key); } LOG.warn("crf " + key + " not found in config "); return null; } SmartForest<List<String>> sw = kv.getV(); if (sw == null) { sw = init(key, kv, false); } return sw; } /** * 加载词典 * * @param key * @param kv * @param reload 是否更新词典 * @return */ private static synchronized SmartForest<List<String>> init(String key, KV<String, SmartForest<List<String>>> kv, boolean reload) { SmartForest<List<String>> forest = kv.getV(); if (forest != null) { if (reload) { forest.clear(); } else { return forest; } } else { forest = new SmartForest<>(); } LOG.debug("begin init synonyms " + kv.getK()); long start = System.currentTimeMillis(); try (BufferedReader reader = IOUtil.getReader(PathToStream.stream(kv.getK()), IOUtil.UTF8)) { String temp = null; while ((temp = reader.readLine()) != null) { if (StringUtil.isBlank(temp)) { continue; } String[] split = temp.split("\t"); List<String> list = new ArrayList<>(); for (String word : split) { if (StringUtil.isBlank(word)) { continue; } list.add(word); } if (split.length <= 1) { LOG.warn(temp + " in synonymsLibrary not in to library !"); continue; } for (int i = 0; i < split.length; i++) { forest.add(split[i], list); } } kv.setV(forest); LOG.info("load synonyms use time:" + (System.currentTimeMillis() - start) + " path is : " + kv.getK()); return forest; } catch (Exception e) { LOG.error("Init synonyms library error :" + e.getMessage() + ", path: " + kv.getK()); SYNONYMS.remove(key); return null; } } /** * 动态添加 * * @param dicDefault * @param dicDefault2 * @param dic2 */ public static void put(String key, String path) { put(key, path, null); } public static void put(String key, String path, SmartForest<List<String>> value) { SYNONYMS.put(key, KV.with(path, value)); MyStaticValue.ENV.put(key, path); } /** * 删除一个key * * @param key * @return */ public static KV<String, SmartForest<List<String>>> remove(String key) { KV<String, SmartForest<List<String>>> kv = SYNONYMS.get(key); if (kv != null && kv.getV() != null) { //先清空后删除 kv.getV().clear(); } MyStaticValue.ENV.remove(key) ; return SYNONYMS.remove(key); } /** * 刷新一个,将值设置为null * * @param key * @return */ public static void reload(String key) { if (!MyStaticValue.ENV.containsKey(key)) { //如果变量中不存在直接删掉这个key不解释了 remove(key); } putIfAbsent(key, MyStaticValue.ENV.get(key)); KV<String, SmartForest<List<String>>> kv = SYNONYMS.get(key); init(key, kv, true); } public static Set<String> keys() { return SYNONYMS.keySet(); } public static void putIfAbsent(String key, String path) { if (!SYNONYMS.containsKey(key)) { SYNONYMS.put(key, KV.with(path, (SmartForest<List<String>>) null)); } } /** * 覆盖更新同义词 [中国, 中华, 我国] -> replace([中国,华夏]) -> [中国,华夏] * * @param words */ public static void insert(String key, String[] words) { SmartForest<List<String>> synonyms = get(key); List<String> list = new ArrayList<>(); for (String word : words) { if (StringUtil.isBlank(word)) { continue; } list.add(word); } if (list.size() <= 1) { LOG.warn(Arrays.toString(words) + " not have any change because it less than 2 word"); return; } Set<String> set = findAllWords(key, words); for (String word : list) { set.remove(word); synonyms.add(word, list); } for (String word : set) { //删除所有 synonyms.remove(word); synonyms.getBranch(word).setParam(null); } } private static Set<String> findAllWords(String key, String[] words) { SmartForest<List<String>> synonyms = get(key); Set<String> set = new HashSet<>(); for (String word : words) { SmartForest<List<String>> branch = synonyms.getBranch(word); if (branch != null) { List<String> params = branch.getParam(); if (params != null) { set.addAll(params); } } } return set; } /** * 合并更新同义词 覆盖更新同义词 [中国, 中华, 我国] -> append([中国,华夏]) -> [中国, 中华, 我国 , 华夏] * * @param words */ public static void append(String key, String[] words) { SmartForest<List<String>> synonyms = get(key); Set<String> set = new HashSet<>(); for (String word : words) { if (StringUtil.isBlank(word)) { continue; } set.add(word); } if (set.size() <= 1) { LOG.warn(Arrays.toString(words) + " not have any change because it less than 2 word"); return; } set.addAll(findAllWords(key, words)); List<String> list = new ArrayList<>(set); for (String word : list) { synonyms.addBranch(word, list); } } /** * 从同义词组中删除掉一个词 [中国, 中华, 我国] -> remove(我国) -> [中国, 中华] * * @param words */ public static void remove(String key, String word) { SmartForest<List<String>> synonyms = get(key); SmartForest<List<String>> branch = synonyms.getBranch(word); if (branch == null || branch.getStatus() < 2) { return; } List<String> params = branch.getParam(); synonyms.remove(word); branch.setParam(null); params.remove(word); if (params.size() == 1) { //如果是1 个也删除 synonyms.remove(params.get(0)); params.remove(0); } else { params.remove(word); } } }