/*
 * Copyright 2011-2013 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.kr.tagging;

import org.apache.lucene.analysis.kr.morph.AnalysisOutput;
import org.apache.lucene.analysis.kr.morph.MorphException;
import org.apache.lucene.analysis.kr.morph.PatternConstants;
import org.apache.lucene.analysis.kr.utils.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Selects the best candidate among several morphological analysis results.
 * <p>
 * The instance remembers the result chosen by the previous {@code tagging} call (field {@code po})
 * and uses it as the "preceding word" for the next call, so it is meant to be driven word by word
 * within one sentence.
 *
 * @author user
 */
public class Tagger {

    private static final Logger log = LoggerFactory.getLogger(Tagger.class);

    private static final String tagDicLoc = "tagger.dic";

    /** Marker used in a grammar rule for "no constraint on this word/ending". */
    private static final String NILL = "NILL";

    /** Marker used in a grammar rule for "no constraint on this pattern". */
    private static final String NOPATN = "0";

    /** Grammar rules, keyed by a direction prefix ("F"/"R") followed by the rule key. */
    private static final Trie<String, String[]> occurrences = new Trie<String, String[]>(true);

    static {
        loadTaggerDic();
    }

    /** Result selected by the most recent successful {@code tagging} call; {@code null} before the first one. */
    private AnalysisOutput po;

    /**
     * Selects the best analysis for {@code psource} without any following-word context.
     *
     * @param psource surface form of the current word
     * @param pmorphs candidate analyses of the current word
     * @return the selected analysis, or {@code null} when {@code pmorphs} is {@code null} or empty
     * @throws MorphException if a grammar rule lookup fails
     */
    public AnalysisOutput tagging(String psource, List<AnalysisOutput> pmorphs) throws MorphException {
        return tagging(psource, null, pmorphs, null);
    }

    /**
     * Selects the best analysis for {@code psource}, optionally using the following word as context.
     *
     * @param psource surface form of the current word
     * @param rsource surface form of the following word, may be {@code null}
     * @param pmorphs candidate analyses of the current word
     * @param rmorphs candidate analyses of the following word, may be {@code null} or empty
     * @return the selected analysis (its source set to {@code psource}), or {@code null} when
     *         {@code pmorphs} is {@code null} or empty
     * @throws MorphException if a grammar rule lookup fails
     */
    public AnalysisOutput tagging(String psource, String rsource, List<AnalysisOutput> pmorphs,
                                  List<AnalysisOutput> rmorphs) throws MorphException {
        // Without candidates for the current word there is nothing to choose from. The previous
        // guard only returned when BOTH lists were empty, which let a null/empty pmorphs through
        // to lookupBest() and failed there with NullPointerException/IndexOutOfBoundsException.
        if (pmorphs == null || pmorphs.isEmpty()) {
            return null;
        }

        po = lookupBest(psource, rsource, pmorphs, rmorphs);
        po.setSource(psource);
        return po;
    }

    /**
     * Picks one of {@code pmorphs}.
     * <ol>
     * <li>A single candidate is returned as is.</li>
     * <li>If following-word candidates exist, rules keyed on the following word are tried.</li>
     * <li>If a previous result exists, rules keyed on the preceding word are tried.</li>
     * <li>Otherwise the first candidate is returned.</li>
     * </ol>
     * Callers must pass a non-empty {@code pmorphs}.
     */
    private AnalysisOutput lookupBest(String psource, String rsource, List<AnalysisOutput> pmorphs,
                                      List<AnalysisOutput> rmorphs) throws MorphException {
        if (pmorphs.size() == 1) {
            return pmorphs.get(0);
        }

        AnalysisOutput select = null;

        if (rmorphs != null && rmorphs.size() != 0) {
            select = lookupBestByRWord(psource, rsource, pmorphs, rmorphs);
        }
        if (select != null) {
            return select;
        }

        if (po != null) {
            select = lookupBestByPWord(psource, pmorphs);
        }
        if (select != null) {
            return select;
        }

        return pmorphs.get(0);
    }

    /**
     * Decides the current word from the preceding word ({@code po}), which must not be {@code null}.
     * Here {@code rsource}/{@code rmorphs} are the current word and its candidates, because the
     * current word is the "rear" word relative to {@code po}.
     *
     * @throws org.apache.lucene.analysis.kr.morph.MorphException if a grammar rule lookup fails
     */
    private AnalysisOutput lookupBestByPWord(String rsource, List<AnalysisOutput> rmorphs) throws MorphException {
        List<AnalysisOutput> removes = new ArrayList<AnalysisOutput>();

        for (AnalysisOutput morph : rmorphs) {
            // Rules keyed on the surface form of the current word.
            Iterator<String[]> iterw = getGR("F" + rsource + "^W");
            AnalysisOutput best = selectBest(iterw, po.getSource(), rsource, po, morph, true, removes);
            if (best != null) {
                return best;
            }

            // Rules keyed on the stem of this candidate.
            Iterator<String[]> iters = getGR("F" + morph.getStem() + "^S");
            best = selectBest(iters, po.getSource(), rsource, po, morph, true, removes);
            if (best != null) {
                return best;
            }
        }

        // Drop rejected candidates, but always keep at least one.
        for (AnalysisOutput morph : removes) {
            if (rmorphs.size() > 1) {
                rmorphs.remove(morph);
            }
        }

        return null;
    }

    /**
     * Decides the current word from the following word, whose candidates must not be {@code null}.
     * Only following-word candidates scored {@code AnalysisOutput.SCORE_CORRECT} are considered;
     * iteration stops at the first one that is not.
     *
     * @throws org.apache.lucene.analysis.kr.morph.MorphException if a grammar rule lookup fails
     */
    private AnalysisOutput lookupBestByRWord(String psource, String rsource, List<AnalysisOutput> pmorphs,
                                             List<AnalysisOutput> rmorphs) throws MorphException {
        List<AnalysisOutput> removes = new ArrayList<AnalysisOutput>();

        for (AnalysisOutput rmorph : rmorphs) {
            if (rmorph.getScore() != AnalysisOutput.SCORE_CORRECT) {
                break;
            }

            for (AnalysisOutput pmorph : pmorphs) {
                // Rules keyed on the surface form of the current word.
                Iterator<String[]> iterw = getGR("R" + psource + "^W/");
                AnalysisOutput best = selectBest(iterw, psource, rsource, pmorph, rmorph, false, removes);
                if (best != null) {
                    return best;
                }

                // Rules keyed on the ending (josa, else eomi) of this candidate.
                String pend = pmorph.getJosa();
                if (pend == null) {
                    pend = pmorph.getEomi();
                }
                Iterator<String[]> iters = getGR("R" + NILL + "/" + pend + "/");
                best = selectBest(iters, psource, rsource, pmorph, rmorph, false, removes);
                if (best != null) {
                    return best;
                }

                // Rules keyed on the stem of this candidate.
                iters = getGR("R" + pmorph.getStem() + "^S/");
                best = selectBest(iters, psource, rsource, pmorph, rmorph, false, removes);
                if (best != null) {
                    return best;
                }
            }
        }

        // Drop rejected candidates, but always keep at least one.
        for (AnalysisOutput morph : removes) {
            if (pmorphs.size() > 1) {
                pmorphs.remove(morph);
            }
        }

        return null;
    }

    /**
     * Walks the given rules and returns the candidate accepted by the first matching rule.
     *
     * @param rear    {@code true} to return {@code rmorph} on a match, {@code false} to return {@code pmorph}
     * @param removes collects candidates rejected by a rule whose 7th field is {@code "1"}
     * @return the accepted candidate, or {@code null} if no rule matched
     */
    private AnalysisOutput selectBest(Iterator<String[]> iter, String psource, String rsource,
                                      AnalysisOutput pmorph, AnalysisOutput rmorph, boolean rear,
                                      List<AnalysisOutput> removes) {
        while (iter.hasNext()) {
            String[] values = iter.next();

            if (checkGrammer(values, psource, rsource, pmorph, rmorph, rear)) {
                return rear ? rmorph : pmorph;
            }

            if ("1".equals(values[6])) {
                // NOTE(review): pmorph is recorded even when rear == true, in which case the caller
                // (lookupBestByPWord) removes the recorded entries from the current-word candidates
                // while pmorph is the previous result. Confirm whether rmorph was intended there.
                if (!removes.contains(pmorph)) {
                    removes.add(pmorph);
                }
                break;
            }
        }

        return null;
    }

    /**
     * Checks one grammar rule against a (preceding, following) candidate pair.
     * Rule layout: [0] front word, [1] front ending, [2] front pattern,
     * [3] rear word, [4] rear ending, [5] rear pattern.
     *
     * @param depFront {@code true} to check the front word constraint and skip the rear word
     *                 constraint; {@code false} for the opposite
     * @return {@code true} if every applicable constraint is satisfied
     */
    private boolean checkGrammer(String[] values, String psource, String rsource,
                                 AnalysisOutput pmorph, AnalysisOutput rmorph, boolean depFront) {
        String pend = pmorph.getJosa();
        if (pend == null) {
            pend = pmorph.getEomi();
        }

        String rend = rmorph.getJosa();
        if (rend == null) {
            rend = rmorph.getEomi();
        }

        // Lexeme of the preceding word.
        if (depFront && !NILL.equals(values[0]) && !checkWord(psource, values[0], pmorph)) {
            return false;
        }
        // Ending of the preceding word.
        if (!NILL.equals(values[1]) && !checkEomi(values[1], pend)) {
            return false;
        }
        // Pattern of the preceding word.
        if (!NOPATN.equals(values[2]) && !checkPattern(values[2], pmorph.getPatn())) {
            return false;
        }
        // Lexeme of the following word.
        if (!depFront && !NILL.equals(values[3]) && !checkWord(rsource, values[3], rmorph)) {
            return false;
        }
        // Ending of the following word.
        if (!NILL.equals(values[4]) && !checkEomi(values[4], rend)) {
            return false;
        }
        // Pattern of the following word.
        if (!NOPATN.equals(values[5]) && !checkPattern(values[5], rmorph.getPatn())) {
            return false;
        }

        return true;
    }

    /**
     * Matches a word constraint of the form {@code "a,b,c^T"}: when {@code T} is {@code "S"} the
     * candidate's stem is compared, otherwise the surface form {@code source}.
     */
    private boolean checkWord(String source, String value, AnalysisOutput morph) {
        String[] types = StringUtil.split(value, "^");
        String[] strs = StringUtil.split(types[0], ",");

        String text = source;
        if ("S".equals(types[1])) {
            text = morph.getStem();
        }

        for (String str : strs) {
            if (str.equals(text)) {
                return true;
            }
        }
        return false;
    }

    /** Returns {@code true} if {@code rend} equals one of the comma-separated endings in {@code value}. */
    private boolean checkEomi(String value, String rend) {
        String[] strs = StringUtil.split(value, ",");

        for (String str : strs) {
            if (str.equals(rend)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Matches a comma-separated pattern constraint: {@code "E"} accepts any eomi phrase pattern,
     * {@code "J"} accepts any josa noun phrase pattern or {@code PatternConstants.PTN_N}, and any
     * other token must equal the decimal pattern number.
     */
    private boolean checkPattern(String value, int ptn) {
        String[] strs = StringUtil.split(value, ",");
        String strPtn = Integer.toString(ptn);

        for (String str : strs) {
            if ("E".equals(str) && ConstraintUtil.isEomiPhrase(ptn)) {
                return true;
            } else if ("J".equals(str) && (ConstraintUtil.isJosaNounPhrase(ptn) || ptn == PatternConstants.PTN_N)) {
                return true;
            } else if (str.equals(strPtn)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the grammar rules whose key starts with {@code prefix}.
     *
     * @param prefix direction marker ("F" or "R") followed by the beginning of the rule key
     * @return an iterator over the matching rules
     */
    @SuppressWarnings("unchecked")
    public static synchronized Iterator<String[]> getGR(String prefix) throws MorphException {
        return (Iterator<String[]>) occurrences.getPrefixedBy(prefix);
    }

    /**
     * Reads the tagger dictionary and fills {@code occurrences}.
     * Each usable line has four ':'-separated fields; lines with a different field count are skipped.
     *
     * @throws MorphException if the dictionary cannot be read or parsed
     */
    private static void loadTaggerDic() throws MorphException {
        try {
            log.info("Tagger 사전을 읽어드립니다...");
            List<String> strs = FileUtil.readLines(KoreanEnv.getInstance().getValue(tagDicLoc), KoreanEnv.UTF8);

            log.info("Tagger 사전을 파싱합니다...");
            for (String str : strs) {
                if (str == null) {
                    continue;
                }
                str = str.trim();

                String[] syls = StringUtil.split(str, ":");
                if (syls.length != 4) {
                    continue;
                }

                // The key starts with the side the rule is looked up by: for "F" rules that is
                // syls[2], for all others syls[1]. The last '/'-separated part of each is dropped.
                String key;
                if ("F".equals(syls[0])) {
                    key = syls[2].substring(0, syls[2].lastIndexOf("/") + 1)
                            + syls[1].substring(0, syls[1].lastIndexOf("/"));
                } else {
                    key = syls[1].substring(0, syls[1].lastIndexOf("/") + 1)
                            + syls[2].substring(0, syls[2].lastIndexOf("/"));
                }

                String[] patns = StringUtil.split(syls[1] + "/" + syls[2] + "/" + syls[3], "/");
                occurrences.add(syls[0] + key, patns);
            }
            log.info("Tagger 사전을 빌드했습니다.");
        } catch (Exception e) {
            // The rethrown exception only carries the message, so log the original with its
            // stack trace to keep the root cause diagnosable.
            log.error("Fail to read the tagger dictionary.(" + tagDicLoc + ")", e);
            throw new MorphException("Fail to read the tagger dictionary.(" + tagDicLoc + ")\n" + e.getMessage());
        }
    }
}