/* * Copyright 2011-2013 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.kr.utils; import org.apache.lucene.analysis.kr.morph.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Collections; import java.util.List; public class NounUtil { private static final Logger log = LoggerFactory.getLogger(NounUtil.class); private static final boolean isTraceEnabled = log.isTraceEnabled(); private static final boolean isDebugEnabled = log.isDebugEnabled(); private static final List<String> DNouns; static { DNouns = new ArrayList<String>(); Collections.addAll(DNouns, "등", "들", "상", "간", "뿐", "별"); } /** * 어간부가 음/기 로 끝나는 경우 * * @throws org.apache.lucene.analysis.kr.morph.MorphException * */ public static boolean analysisMJ(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException { int strlen = o.getStem().length(); if (strlen < 2) return false; char[] chrs = MorphUtil.decompose(o.getStem().charAt(strlen - 1)); boolean success = false; if (o.getStem().charAt(strlen - 1) != '기' && !(chrs.length == 3 && chrs[2] == 'ㅁ')) return false; String start = o.getStem(); String end = ""; if (o.getStem().charAt(strlen - 1) == '기') { start = o.getStem().substring(0, strlen - 1); end = "기"; } else if (o.getStem().charAt(strlen - 1) == '음') { start = o.getStem().substring(0, strlen - 1); end = "음"; } String[] eomis = EomiUtil.splitEomi(start, end); if (eomis[0] == null) return false; String[] pomis = EomiUtil.splitPomi(eomis[0]); o.setStem(pomis[0]); o.addElist(eomis[1]); o.setPomi(pomis[1]); try { if (analysisVMJ(o.clone(), candidates)) return true; if (analysisVMXMJ(o.clone(), candidates)) return true; if (analysisNSMJ(o.clone(), candidates)) return true; } catch (CloneNotSupportedException e) { throw new MorphException(e.getMessage(), e); } if (DictionaryUtil.getVerb(o.getStem()) != null) { o.setPos(PatternConstants.POS_VERB); o.setPatn(PatternConstants.PTN_VMJ); o.setScore(AnalysisOutput.SCORE_CORRECT); candidates.add(o); return true; } return false; } /** * 용언 + '음/기' + 조사(PTN_VMXMJ) * * @throws org.apache.lucene.analysis.kr.morph.MorphException * */ public static boolean analysisVMJ(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException { String[] irrs = IrregularUtil.restoreIrregularVerb(o.getStem(), o.getElist().get(0)); if (irrs != null) { o.setStem(irrs[0]); o.setElist(irrs[1], 0); } if (DictionaryUtil.getVerb(o.getStem()) != null) { o.setPatn(PatternConstants.PTN_VMJ); o.setPos(PatternConstants.POS_VERB); o.setScore(AnalysisOutput.SCORE_CORRECT); candidates.add(o); return true; } return false; } /** * 용언 + '아/어' + 보조용언 + '음/기' + 조사(PTN_VMXMJ) * * @throws org.apache.lucene.analysis.kr.morph.MorphException * */ public static boolean analysisVMXMJ(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException { int idxXVerb = VerbUtil.endsWithXVerb(o.getStem()); if (idxXVerb != -1) { // 2. 사랑받아보다 String eogan = o.getStem().substring(0, idxXVerb); o.setXverb(o.getStem().substring(idxXVerb)); String[] stomis = null; if (eogan.endsWith("아") || eogan.endsWith("어")) stomis = EomiUtil.splitEomi(eogan.substring(0, eogan.length() - 1), eogan.substring(eogan.length() - 1)); else stomis = EomiUtil.splitEomi(eogan, ""); if (stomis[0] == null) return false; String[] irrs = IrregularUtil.restoreIrregularVerb(stomis[0], stomis[1]); if (irrs != null) { o.setStem(irrs[0]); o.addElist(irrs[1]); } else { o.setStem(stomis[0]); o.addElist(stomis[1]); } if (DictionaryUtil.getVerb(o.getStem()) != null) { o.setPatn(PatternConstants.PTN_VMXMJ); o.setPos(PatternConstants.POS_VERB); o.setScore(AnalysisOutput.SCORE_CORRECT); candidates.add(o); return true; } else if (analysisNSMXMJ(o, candidates)) { return true; } } return false; } /** * 체언 + 용언화접미사 + '음/기' + 조사 (PTN_NSMJ) * * @throws org.apache.lucene.analysis.kr.morph.MorphException * */ public static boolean analysisNSMJ(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException { int idxVbSfix = VerbUtil.endsWithVerbSuffix(o.getStem()); if (idxVbSfix == -1) return false; o.setVsfx(o.getStem().substring(idxVbSfix)); o.setStem(o.getStem().substring(0, idxVbSfix)); o.setPatn(PatternConstants.PTN_NSMJ); o.setPos(PatternConstants.POS_NOUN); WordEntry entry = DictionaryUtil.getWordExceptVerb(o.getStem()); if (entry != null) { if (entry.getFeature(WordEntry.IDX_NOUN) == '0') return false; else if (o.getVsfx().equals("하") && entry.getFeature(WordEntry.IDX_DOV) != '1') return false; else if (o.getVsfx().equals("되") && entry.getFeature(WordEntry.IDX_BEV) != '1') return false; else if (o.getVsfx().equals("내") && entry.getFeature(WordEntry.IDX_NE) != '1') return false; o.setScore(AnalysisOutput.SCORE_CORRECT); // '입니다'인 경우 인명 등 미등록어가 많이 발생되므로 분석성공으로 가정한다. } else { o.setScore(AnalysisOutput.SCORE_ANALYSIS); // '입니다'인 경우 인명 등 미등록어가 많이 발생되므로 분석성공으로 가정한다. } candidates.add(o); return true; } public static boolean analysisNSMXMJ(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException { int idxVbSfix = VerbUtil.endsWithVerbSuffix(o.getStem()); if (idxVbSfix == -1) return false; o.setVsfx(o.getStem().substring(idxVbSfix)); o.setStem(o.getStem().substring(0, idxVbSfix)); o.setPatn(PatternConstants.PTN_NSMXMJ); o.setPos(PatternConstants.POS_NOUN); WordEntry entry = DictionaryUtil.getWordExceptVerb(o.getStem()); if (entry != null) { if (entry.getFeature(WordEntry.IDX_NOUN) == '0') return false; else if (o.getVsfx().equals("하") && entry.getFeature(WordEntry.IDX_DOV) != '1') return false; else if (o.getVsfx().equals("되") && entry.getFeature(WordEntry.IDX_BEV) != '1') return false; else if (o.getVsfx().equals("내") && entry.getFeature(WordEntry.IDX_NE) != '1') return false; o.setScore(AnalysisOutput.SCORE_CORRECT); // '입니다'인 경우 인명 등 미등록어가 많이 발생되므로 분석성공으로 가정한다. } else { o.setScore(AnalysisOutput.SCORE_ANALYSIS); // '입니다'인 경우 인명 등 미등록어가 많이 발생되므로 분석성공으로 가정한다. } candidates.add(o); return true; } // /** // * 복합명사인지 조사하고, 복합명사이면 단위명사들을 찾는다. // * 복합명사인지 여부는 단위명사가 모두 사전에 있는지 여부로 판단한다. // * 단위명사는 2글자 이상 단어에서만 찾는다. // * @param o // * @return // * @throws org.apache.lucene.analysis.MorphException // */ // public static boolean confirmCNoun(AnalysisOutput o) throws MorphException { // // if(o.getStem().length()<3) return false; // if(o.getPatn()==PatternConstants.PTN_N // &&DictionaryUtil.existJosa(o.getStem().substring(o.getStem().length()-2))) return false; // // List<CompoundEntry> results = new ArrayList(); // List<List> queue = new ArrayList(); // String prefix = o.getStem().substring(0,1); // // int pos = 0; // boolean moreTwo = false; // while(pos<o.getStem().length()) { // // List<WordEntry> nList = findNouns(o.getStem().substring(pos),queue.size(),o); // if(nList==null) return false; // // if(pos==0&&DictionaryUtil.existPrefix(prefix)) nList.add(new WordEntry(prefix)); // // if(nList.size()==0) { // if(queue.size()==0) return false; // List<WordEntry> tmpList = queue.get(queue.size()-1); // // tmpList.remove(tmpList.size()-1); // pos -= results.get(queue.size()-1).getWord().length(); // if(tmpList.size()==0) { // while(tmpList.size()==0) { // results.remove(queue.size()-1); // queue.remove(tmpList); // if(queue.size()==0) return false; // // tmpList = queue.get(queue.size()-1); // tmpList.remove(tmpList.size()-1); // if(tmpList.size()==0) continue; // // pos -= results.get(queue.size()-1).getWord().length(); // results.set(queue.size()-1, new CompoundEntry(tmpList.get(tmpList.size()-1).getWord(),pos)); // pos += tmpList.get(tmpList.size()-1).getWord().length(); // // } // }else { // results.set(queue.size()-1, new CompoundEntry(tmpList.get(tmpList.size()-1).getWord(),pos)); // pos += tmpList.get(tmpList.size()-1).getWord().length(); // } // // } else { // queue.add(nList); // WordEntry noun = nList.get(nList.size()-1); // results.add(new CompoundEntry(noun.getWord(),pos)); // pos += noun.getWord().length(); // if(noun.getCompounds().size()>0) o.addCNoun(noun.getCompounds()); // if(noun.getWord().length()>1) moreTwo=true; // } // } // // if(results.size()>1&&DNouns.contains(results.get(results.size()-1).getWord())) { // CompoundEntry dnoun = results.remove(results.size()-1); // o.setStem(o.getStem().substring(0,o.getStem().length()-dnoun.getWord().length())); // o.setNsfx(dnoun.getWord()); // } // // if(results.size()>1) o.addCNoun(results); // // o.setScore(AnalysisOutput.SCORE_CORRECT); // return true; // } /** * 복합명사에서 단위명사를 분리해낸다. * 리스트의 가장 마지막에 위치한 단어가 최장단어이다. * * @param str 복합명사 * @param pos * @param o 분석결과 * @return 단위명사 리스트 * @throws org.apache.lucene.analysis.kr.morph.MorphException * */ private static List findNouns(String str, int pos, AnalysisOutput o) throws MorphException { if (isTraceEnabled) log.trace("복합명사에서 단위명사를 분리합니다. str=[{}], pos=[{}]", str, pos); List<WordEntry> nList = new ArrayList<WordEntry>(); if (str.length() == 2 && DictionaryUtil.existSuffix(str.substring(0, 1)) && DNouns.contains(str.substring(1))) { o.setStem(o.getStem().substring(0, o.getStem().length() - 1)); o.setNsfx(str.substring(1)); nList.add(new WordEntry(str.substring(0, 1))); return nList; } else if (str.length() == 2 && DictionaryUtil.existSuffix(str.substring(0, 1)) && DictionaryUtil.existJosa(str.substring(1))) { return null; } if (pos >= 2 && DictionaryUtil.existJosa(str)) return null; if (str.length() == 1 && (DictionaryUtil.existSuffix(str) || DNouns.contains(str))) { nList.add(new WordEntry(str)); return nList; } for (int i = 1; i < str.length(); i++) { String sub = str.substring(0, i + 1); if (!DictionaryUtil.findWithPrefix(sub).hasNext()) break; WordEntry entry = DictionaryUtil.getCNoun(sub); if (entry != null) { nList.add(entry); } } return nList; } /* * 마지막 음절이 명사형 접미사(등,상..)인지 조사한다. */ public static boolean confirmDNoun(AnalysisOutput output) throws MorphException { int strlen = output.getStem().length(); String d = output.getStem().substring(strlen - 1); if (!DNouns.contains(d)) return false; String s = output.getStem().substring(0, strlen - 1); output.setNsfx(d); output.setStem(s); WordEntry cnoun = DictionaryUtil.getCNoun(s); if (cnoun != null) { if (cnoun.getFeature(WordEntry.IDX_NOUN) == '2') output.setCNoun(cnoun.getCompounds()); else output.setCNoun(new ArrayList<CompoundEntry>()); output.setScore(AnalysisOutput.SCORE_CORRECT); } return true; } // public static int endsWithDNoun(String stem) { // for(int i = 0; i < DNouns.length; i++) // if(stem.endsWith(DNouns[i])) // return stem.lastIndexOf(DNouns[i]); // // return -1; // } public static boolean endsWith2Josa(String input) throws MorphException { boolean josaFlag = true; for (int i = input.length() - 2; i > 0; i--) { String josa = input.substring(i); char[] feature = SyllableUtil.getFeature(josa.charAt(0)); if (josaFlag && DictionaryUtil.existJosa(josa)) return true; if (josaFlag && feature[SyllableUtil.IDX_JOSA2] == '0') josaFlag = false; if (!josaFlag) break; } return false; } public static double countFoundNouns(AnalysisOutput o) { if (o.getCNounList().size() == 0) return 0.0; int count = 0; for (final CompoundEntry entry : o.getCNounList()) { if (entry.isExist()) count++; } return (count * 100) / o.getCNounList().size(); } }