/* * Copyright 2011-2013 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.kr.utils; import org.apache.lucene.analysis.kr.morph.CompoundEntry; import org.apache.lucene.analysis.kr.morph.MorphException; import org.apache.lucene.analysis.kr.morph.WordEntry; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.concurrent.Future; @SuppressWarnings( "unchecked" ) public class DictionaryUtil { private static final Logger log = LoggerFactory.getLogger(DictionaryUtil.class); private static Trie<String, WordEntry> dictionary = new Trie<String, WordEntry>(true); private static HashMap<String, String> josas = new HashMap<String, String>(); private static HashMap<String, String> eomis = new HashMap<String, String>(); private static HashMap<String, String> prefixs = new HashMap<String, String>(); private static HashMap<String, String> suffixs = new HashMap<String, String>(); private static HashMap<String, WordEntry> uncompounds = new HashMap<String, WordEntry>(); private static HashMap<String, String> cjwords = new HashMap<String, String>(); static { loadDictionary(); readFile(josas, KoreanEnv.FILE_JOSA); readFile(eomis, KoreanEnv.FILE_EOMI); readFile(prefixs, KoreanEnv.FILE_PREFIX); readFile(suffixs, KoreanEnv.FILE_SUFFIX); // uncompunds char[] features = "90000X".toCharArray(); List<String> lines = FileUtil.readLines(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_UNCOMPOUNDS), KoreanEnv.UTF8); for (String compound : lines) { String[] infos = StringUtil.split(compound, ":"); if (infos.length != 2) continue; WordEntry entry = new WordEntry(infos[0].trim(), features); entry.setCompounds(compoundArrayToList(infos[1], StringUtil.split(infos[1], ","))); uncompounds.put(entry.getWord(), entry); } lines = FileUtil.readLines(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_CJ), KoreanEnv.UTF8); for (String cj : lines) { String[] infos = StringUtil.split(cj, ":"); if (infos.length != 2) continue; cjwords.put(infos[0], infos[1]); } } /** 사전을 로드한다. */ public synchronized static void loadDictionary() throws MorphException { log.info("사전을 로드합니다..."); log.info("표준 사전을 로드합니다..."); Future<List<String>> standardDic = FileUtil.readLinesAsync(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_DICTIONARY), KoreanEnv.UTF8); log.info("복합명사 사전을 로드합니다..."); Future<List<String>> compoundDic = FileUtil.readLinesAsync(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_COMPOUNDS), KoreanEnv.UTF8); log.info("확장 사전을 로드합니다..."); Future<List<String>> extensionDic = FileUtil.readLinesAsync(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_EXTENSION), KoreanEnv.UTF8); log.info("사용자 정의 사전을 로드합니다..."); Future<List<String>> customDic = FileUtil.readLinesAsync(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_CUSTOM), KoreanEnv.UTF8); try { log.info("표준 사전을 파싱합니다..."); List<String> standards = standardDic.get(); // List<String> standards = FileUtil.readLines(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_DICTIONARY), "UTF-8"); int count = 0; for (String str : standards) { String[] infos = StringUtil.split(str, ","); if (infos.length != 2) continue; infos[1] = infos[1].trim(); if (infos[1].length() == 6) infos[1] = infos[1].substring(0, 5) + "000" + infos[1].substring(5); if (dictionary.get(infos[0].trim()) == null) { WordEntry entry = new WordEntry(infos[0].trim(), infos[1].trim().toCharArray()); dictionary.add(entry.getWord(), entry); count++; } } log.info("표준 사전을 빌드했습니다. 단어수=[{}], 등록수=[{}]", standards.size(), count); } catch (Exception e) { log.error("표준 사전을 로드하는데 실패했습니다.", e); throw new MorphException(e); } try { log.info("복합명사 사전을 파싱합니다..."); List<String> compounds = compoundDic.get(); // List<String> compounds = FileUtil.readLines(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_COMPOUNDS), "UTF-8"); char[] features = "20000000X".toCharArray(); int count = 0; for (String compound : compounds) { String[] infos = StringUtil.split(compound, ":"); if (infos.length != 2) continue; if (dictionary.get(infos[0].trim()) == null) { WordEntry entry = new WordEntry(infos[0].trim(), features); entry.setCompounds(compoundArrayToList(infos[1], StringUtil.split(infos[1], ","))); dictionary.add(entry.getWord(), entry); count++; } } log.info("복합명사 사전을 빌드했습니다. 단어수=[{}], 등록수=[{}]", compounds.size(), count); } catch (Exception e) { log.error("복합명사 사전을 로드하는데 실패했습니다.", e); throw new MorphException(e); } try { log.info("확장 사전을 파싱합니다..."); List<String> extensions = extensionDic.get(); // List<String> extensions = FileUtil.readLines(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_EXTENSION), "UTF-8"); int count = 0; for (String str : extensions) { String[] infos = StringUtil.split(str, ","); if (infos.length != 2) continue; infos[1] = infos[1].trim(); if (infos[1].length() == 6) infos[1] = infos[1].substring(0, 5) + "000" + infos[1].substring(5); if (dictionary.get(infos[0].trim()) == null) { WordEntry entry = new WordEntry(infos[0].trim(), infos[1].trim().toCharArray()); dictionary.add(entry.getWord(), entry); count++; } } log.info("확장 사전을 빌드했습니다. 단어수=[{}], 등록수=[{}]", extensions.size(), count); } catch (Exception e) { log.error("확장 사전을 로드하는데 실패했습니다.", e); throw new MorphException(e); } try { log.info("사용자정의 사전을 파싱합니다..."); List<String> customs = customDic.get(); // final List<String> customs = FileUtil.readLines(KoreanEnv.getInstance().getValue(KoreanEnv.FILE_CUSTOM), "UTF-8"); char[] features = "100000000X".toCharArray(); int count = 0; for (String custom : customs) { if (custom != null && custom.trim().length() > 0) { if (dictionary.get(custom.trim()) == null) { WordEntry entry = new WordEntry(custom.trim(), features); dictionary.add(entry.getWord(), entry); count++; } } } log.info("사용자정의 사전을 빌드했습니다. 단어수=[{}], 등록수=[{}]", customs.size(), count); } catch (Exception e) { log.error("사용자정의 사전을 로드하는데 실패했습니다.", e); throw new MorphException(e); } log.info("사전을 빌드했습니다."); } public static Iterator findWithPrefix(String prefix) throws MorphException { return dictionary.getPrefixedBy(prefix); } public static WordEntry getWord(String key) throws MorphException { if (key.length() == 0) return null; return (WordEntry) dictionary.get(key); } public static WordEntry getWordExceptVerb(String key) throws MorphException { WordEntry entry = getWord(key); if (entry == null) return null; if (entry.getFeature(WordEntry.IDX_NOUN) == '1' || entry.getFeature(WordEntry.IDX_BUSA) == '1') return entry; return null; } public static WordEntry getNoun(String key) throws MorphException { WordEntry entry = getWord(key); if (entry == null) return null; if (entry.getFeature(WordEntry.IDX_NOUN) == '1') return entry; return null; } public static WordEntry getCNoun(String key) throws MorphException { WordEntry entry = getWord(key); if (entry == null) return null; if (entry.getFeature(WordEntry.IDX_NOUN) == '1' || entry.getFeature(WordEntry.IDX_NOUN) == '2') return entry; return null; } public static WordEntry getVerb(String key) throws MorphException { WordEntry entry = getWord(key); if (entry == null) return null; if (entry.getFeature(WordEntry.IDX_VERB) == '1') { return entry; } return null; } public static WordEntry getAdverb(String key) throws MorphException { WordEntry entry = getWord(key); if (entry == null) return null; if (entry.getFeature(WordEntry.IDX_BUSA) == '1') return entry; return null; } public static WordEntry getBusa(String key) throws MorphException { WordEntry entry = getWord(key); if (entry == null) return null; if (entry.getFeature(WordEntry.IDX_BUSA) == '1' && entry.getFeature(WordEntry.IDX_NOUN) == '0') return entry; return null; } public static WordEntry getIrrVerb(String key, char irrType) throws MorphException { WordEntry entry = getWord(key); if (entry == null) return null; if (entry.getFeature(WordEntry.IDX_VERB) == '1' && entry.getFeature(WordEntry.IDX_REGURA) == irrType) return entry; return null; } public static WordEntry getBeVerb(String key) throws MorphException { WordEntry entry = getWord(key); if (entry == null) return null; if (entry.getFeature(WordEntry.IDX_BEV) == '1') return entry; return null; } public static WordEntry getDoVerb(String key) throws MorphException { WordEntry entry = getWord(key); if (entry == null) return null; if (entry.getFeature(WordEntry.IDX_DOV) == '1') return entry; return null; } public synchronized static WordEntry getUncompound(String key) throws MorphException { return uncompounds.get(key); } public synchronized static String getCJWord(String key) throws MorphException { return cjwords.get(key); } public static boolean existJosa(String str) throws MorphException { return josas.get(str) != null; } public static boolean existEomi(String str) throws MorphException { return (eomis.get(str) != null); } public static boolean existPrefix(String str) throws MorphException { return prefixs.get(str) != null; } public static boolean existSuffix(String str) throws MorphException { return suffixs.get(str) != null; } /** ㄴ,ㄹ,ㅁ,ㅂ과 eomi 가 결합하여 어미가 될 수 있는지 점검한다. */ public static String combineAndEomiCheck(char s, String eomi) throws MorphException { if (eomi == null) eomi = ""; if (s == 'ㄴ') eomi = "은" + eomi; else if (s == 'ㄹ') eomi = "을" + eomi; else if (s == 'ㅁ') eomi = "음" + eomi; else if (s == 'ㅂ') eomi = "습" + eomi; else eomi = s + eomi; if (existEomi(eomi)) return eomi; return null; } /** * 사전 파일에서 항목을 읽어 사전으로 빌드합니다. * * @throws org.apache.lucene.analysis.kr.morph.MorphException * */ private static void readFile(HashMap<String, String> map, String dic) throws MorphException { String filename = KoreanEnv.getInstance().getValue(dic); try { List<String> lines = FileUtil.readLines(filename, KoreanEnv.UTF8); for (final String line : lines) { map.put(line.trim(), line); } log.info("사전 파일에서 [{}]개를 읽어, [{}]개를 등록했습니다. filename=[{}]", lines.size(), map.size(), filename); } catch (Exception e) { throw new MorphException(e); } } private static List compoundArrayToList(String source, String[] arr) { List list = new ArrayList(); for (String str : arr) { CompoundEntry ce = new CompoundEntry(str); ce.setOffset(source.indexOf(str)); list.add(ce); } return list; } }