package org.apache.lucene.analysis.kr.utils; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.ArrayList; import java.util.HashMap; import java.util.List; import org.apache.lucene.analysis.kr.morph.AnalysisOutput; import org.apache.lucene.analysis.kr.morph.CompoundEntry; import org.apache.lucene.analysis.kr.morph.CompoundNounAnalyzer; import org.apache.lucene.analysis.kr.morph.MorphException; import org.apache.lucene.analysis.kr.morph.PatternConstants; import org.apache.lucene.analysis.kr.morph.WordEntry; public class NounUtil { private static final List DNouns = new ArrayList(); static { String[] strs = new String[]{"등", "들","상","간","뿐","별"}; for(String str:strs) { DNouns.add(str); } }; /** * * 어간부가 음/기 로 끝나는 경우 * * @param o * @param candidates * @return * @throws MorphException */ public static boolean analysisMJ(AnalysisOutput o, List candidates) throws MorphException { int strlen = o.getStem().length(); if(strlen<2) return false; char[] chrs = MorphUtil.decompose(o.getStem().charAt(strlen-1)); boolean success = false; if(o.getStem().charAt(strlen-1)!='기'&&!(chrs.length==3&&chrs[2]=='ㅁ')) return false; String start = o.getStem(); String end = ""; if(o.getStem().charAt(strlen-1)=='기') { start = o.getStem().substring(0,strlen-1); end = "기"; }else if(o.getStem().charAt(strlen-1)=='음') { start = o.getStem().substring(0,strlen-1); end = "음"; } String[] eomis = EomiUtil.splitEomi(start, end); if(eomis[0]==null) return false; String[] pomis = EomiUtil.splitPomi(eomis[0]); o.setStem(pomis[0]); o.addElist(eomis[1]); o.setPomi(pomis[1]); try { if(analysisVMJ(o.clone(),candidates)) return true; if(analysisVMXMJ(o.clone(),candidates)) return true; if(analysisNSMJ(o.clone(),candidates)) return true; } catch (CloneNotSupportedException e) { throw new MorphException(e.getMessage(),e); } if(DictionaryUtil.getVerb(o.getStem())!=null) { o.setPos(PatternConstants.POS_VERB); o.setPatn(PatternConstants.PTN_VMJ); o.setScore(AnalysisOutput.SCORE_CORRECT); candidates.add(o); return true; } return false; } /** * 용언 + '음/기' + 조사(PTN_VMXMJ) * @param o * @param candidates * @return * @throws MorphException */ public static boolean analysisVMJ(AnalysisOutput o, List candidates) throws MorphException { String[] irrs = IrregularUtil.restoreIrregularVerb(o.getStem(), o.getElist().get(0)); if(irrs!=null) { o.setStem(irrs[0]); o.setElist(irrs[1],0); } if(DictionaryUtil.getVerb(o.getStem())!=null) { o.setPatn(PatternConstants.PTN_VMJ); o.setPos(PatternConstants.POS_VERB); o.setScore(AnalysisOutput.SCORE_CORRECT); candidates.add(o); return true; } return false; } /** * 용언 + '아/어' + 보조용언 + '음/기' + 조사(PTN_VMXMJ) * @param o * @param candidates * @return * @throws MorphException */ public static boolean analysisVMXMJ(AnalysisOutput o, List candidates) throws MorphException { int idxXVerb = VerbUtil.endsWithXVerb(o.getStem()); if(idxXVerb!=-1) { // 2. 사랑받아보다 String eogan = o.getStem().substring(0,idxXVerb); o.setXverb(o.getStem().substring(idxXVerb)); String[] stomis = null; if(eogan.endsWith("아")||eogan.endsWith("어")) stomis = EomiUtil.splitEomi(eogan.substring(0,eogan.length()-1),eogan.substring(eogan.length()-1)); else stomis = EomiUtil.splitEomi(eogan,""); if(stomis[0]==null) return false; String[] irrs = IrregularUtil.restoreIrregularVerb(stomis[0], stomis[1]); if(irrs!=null) { o.setStem(irrs[0]); o.addElist(irrs[1]); }else { o.setStem(stomis[0]); o.addElist(stomis[1]); } if(DictionaryUtil.getVerb(o.getStem())!=null) { o.setPatn(PatternConstants.PTN_VMXMJ); o.setPos(PatternConstants.POS_VERB); o.setScore(AnalysisOutput.SCORE_CORRECT); candidates.add(o); return true; }else if(analysisNSMXMJ(o, candidates)){ return true; } } return false; } /** * 체언 + 용언화접미사 + '음/기' + 조사 (PTN_NSMJ) * @param o * @param candidates * @return * @throws MorphException */ public static boolean analysisNSMJ(AnalysisOutput o, List candidates) throws MorphException { int idxVbSfix = VerbUtil.endsWithVerbSuffix(o.getStem()); if(idxVbSfix==-1) return false; o.setVsfx(o.getStem().substring(idxVbSfix)); o.setStem(o.getStem().substring(0,idxVbSfix)); o.setPatn(PatternConstants.PTN_NSMJ); o.setPos(PatternConstants.POS_NOUN); WordEntry entry = DictionaryUtil.getWordExceptVerb(o.getStem()); if(entry!=null) { if(entry.getFeature(WordEntry.IDX_NOUN)=='0') return false; else if(o.getVsfx().equals("하")&&entry.getFeature(WordEntry.IDX_DOV)!='1') return false; else if(o.getVsfx().equals("되")&&entry.getFeature(WordEntry.IDX_BEV)!='1') return false; else if(o.getVsfx().equals("내")&&entry.getFeature(WordEntry.IDX_NE)!='1') return false; o.setScore(AnalysisOutput.SCORE_CORRECT); // '입니다'인 경우 인명 등 미등록어가 많이 발생되므로 분석성공으로 가정한다. }else { o.setScore(AnalysisOutput.SCORE_ANALYSIS); // '입니다'인 경우 인명 등 미등록어가 많이 발생되므로 분석성공으로 가정한다. } candidates.add(o); return true; } public static boolean analysisNSMXMJ(AnalysisOutput o, List candidates) throws MorphException { int idxVbSfix = VerbUtil.endsWithVerbSuffix(o.getStem()); if(idxVbSfix==-1) return false; o.setVsfx(o.getStem().substring(idxVbSfix)); o.setStem(o.getStem().substring(0,idxVbSfix)); o.setPatn(PatternConstants.PTN_NSMXMJ); o.setPos(PatternConstants.POS_NOUN); WordEntry entry = DictionaryUtil.getWordExceptVerb(o.getStem()); if(entry!=null) { if(entry.getFeature(WordEntry.IDX_NOUN)=='0') return false; else if(o.getVsfx().equals("하")&&entry.getFeature(WordEntry.IDX_DOV)!='1') return false; else if(o.getVsfx().equals("되")&&entry.getFeature(WordEntry.IDX_BEV)!='1') return false; else if(o.getVsfx().equals("내")&&entry.getFeature(WordEntry.IDX_NE)!='1') return false; o.setScore(AnalysisOutput.SCORE_CORRECT); // '입니다'인 경우 인명 등 미등록어가 많이 발생되므로 분석성공으로 가정한다. }else { o.setScore(AnalysisOutput.SCORE_ANALYSIS); // '입니다'인 경우 인명 등 미등록어가 많이 발생되므로 분석성공으로 가정한다. } candidates.add(o); return true; } /** * 복합명사인지 조사하고, 복합명사이면 단위명사들을 찾는다. * 복합명사인지 여부는 단위명사가 모두 사전에 있는지 여부로 판단한다. * 단위명사는 2글자 이상 단어에서만 찾는다. * @param o * @return * @throws MorphException */ // public static boolean confirmCNoun(AnalysisOutput o) throws MorphException { // // if(o.getStem().length()<3) return false; // if(o.getPatn()==PatternConstants.PTN_N // &&DictionaryUtil.existJosa(o.getStem().substring(o.getStem().length()-2))) return false; // // List<CompoundEntry> results = new ArrayList(); // List<List> queue = new ArrayList(); // String prefix = o.getStem().substring(0,1); // // int pos = 0; // boolean moreTwo = false; // while(pos<o.getStem().length()) { // // List<WordEntry> nList = findNouns(o.getStem().substring(pos),queue.size(),o); // if(nList==null) return false; // // if(pos==0&&DictionaryUtil.existPrefix(prefix)) nList.add(new WordEntry(prefix)); // // if(nList.size()==0) { // if(queue.size()==0) return false; // List<WordEntry> tmpList = queue.get(queue.size()-1); // // tmpList.remove(tmpList.size()-1); // pos -= results.get(queue.size()-1).getWord().length(); // if(tmpList.size()==0) { // while(tmpList.size()==0) { // results.remove(queue.size()-1); // queue.remove(tmpList); // if(queue.size()==0) return false; // // tmpList = queue.get(queue.size()-1); // tmpList.remove(tmpList.size()-1); // if(tmpList.size()==0) continue; // // pos -= results.get(queue.size()-1).getWord().length(); // results.set(queue.size()-1, new CompoundEntry(tmpList.get(tmpList.size()-1).getWord(),pos)); // pos += tmpList.get(tmpList.size()-1).getWord().length(); // // } // }else { // results.set(queue.size()-1, new CompoundEntry(tmpList.get(tmpList.size()-1).getWord(),pos)); // pos += tmpList.get(tmpList.size()-1).getWord().length(); // } // // } else { // queue.add(nList); // WordEntry noun = nList.get(nList.size()-1); // results.add(new CompoundEntry(noun.getWord(),pos)); // pos += noun.getWord().length(); // if(noun.getCompounds().size()>0) o.addCNoun(noun.getCompounds()); // if(noun.getWord().length()>1) moreTwo=true; // } // } // // if(results.size()>1&&DNouns.contains(results.get(results.size()-1).getWord())) { // CompoundEntry dnoun = results.remove(results.size()-1); // o.setStem(o.getStem().substring(0,o.getStem().length()-dnoun.getWord().length())); // o.setNsfx(dnoun.getWord()); // } // // if(results.size()>1) o.addCNoun(results); // // o.setScore(AnalysisOutput.SCORE_CORRECT); // return true; // } /** * 복합명사에서 단위명사를 분리해낸다. * 리스트의 가장 마지막에 위치한 단어가 최장단어이다. * @param str 복합명사 * @param pos * @param o 분석결과 * @return 단위명사 리스트 * @throws MorphException */ private static List findNouns(String str, int pos, AnalysisOutput o) throws MorphException { List<WordEntry> nList = new ArrayList(); if(str.length()==2&&DictionaryUtil.existSuffix(str.substring(0,1))&&DNouns.contains(str.substring(1))) { o.setStem(o.getStem().substring(0,o.getStem().length()-1)); o.setNsfx(str.substring(1)); nList.add(new WordEntry(str.substring(0,1))); return nList; }else if(str.length()==2&&DictionaryUtil.existSuffix(str.substring(0,1))&&DictionaryUtil.existJosa(str.substring(1))) { return null; } if(pos>=2&&DictionaryUtil.existJosa(str)) return null; if(str.length()==1&&(DictionaryUtil.existSuffix(str)||DNouns.contains(str))) { nList.add(new WordEntry(str)); return nList; } for(int i=1;i<str.length();i++) { String sub = str.substring(0,i+1); if(!DictionaryUtil.findWithPrefix(sub).hasNext()) break; WordEntry entry = DictionaryUtil.getCNoun(sub); if(entry!=null) { nList.add(entry); } } return nList; } /* * 마지막 음절이 명사형 접미사(등,상..)인지 조사한다. */ public static boolean confirmDNoun(AnalysisOutput output) throws MorphException { int strlen = output.getStem().length(); String d = output.getStem().substring(strlen-1); if(!DNouns.contains(d)) return false; String s = output.getStem().substring(0, strlen-1); output.setNsfx(d); output.setStem(s); WordEntry cnoun = DictionaryUtil.getCNoun(s); if(cnoun != null) { if(cnoun.getFeature(WordEntry.IDX_NOUN)=='2') output.setCNoun(cnoun.getCompounds()); else output.setCNoun(new ArrayList()); output.setScore(AnalysisOutput.SCORE_CORRECT); } return true; } // public static int endsWithDNoun(String stem) { // for(int i = 0; i < DNouns.length; i++) // if(stem.endsWith(DNouns[i])) // return stem.lastIndexOf(DNouns[i]); // // return -1; // } public static boolean endsWith2Josa(String input) throws MorphException { boolean josaFlag = true; for(int i=input.length()-2;i>0;i--) { String josa = input.substring(i); char[] feature = SyllableUtil.getFeature(josa.charAt(0)); if(josaFlag&&DictionaryUtil.existJosa(josa)) return true; if(josaFlag&&feature[SyllableUtil.IDX_JOSA2]=='0') josaFlag = false; if(!josaFlag) break; } return false; } public static double countFoundNouns(AnalysisOutput o) { int count = 0; for(int i=0;i<o.getCNounList().size();i++) { if(o.getCNounList().get(i).isExist()) count++; } return (count*100)/o.getCNounList().size(); } }