/*
* Copyright 2011-2013 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.kr.utils;
import org.apache.lucene.analysis.kr.morph.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class NounUtil {
/** Class logger. */
private static final Logger log = LoggerFactory.getLogger(NounUtil.class);
// Snapshots of the logger's levels taken once, when this class is initialized; they do not
// follow later changes to the logging configuration.
private static final boolean isTraceEnabled = log.isTraceEnabled();
private static final boolean isDebugEnabled = log.isDebugEnabled();
/**
 * Single-syllable words that this class treats as a trailing noun suffix
 * (see {@link #confirmDNoun(AnalysisOutput)}). The list is read-only after initialization.
 */
private static final List<String> DNouns;
static {
    List<String> dependentNouns = new ArrayList<String>();
    Collections.addAll(dependentNouns, "등", "들", "상", "간", "뿐", "별");
    // Shared static state: publish it as an unmodifiable view so no caller can alter it.
    DNouns = Collections.unmodifiableList(dependentNouns);
}
/**
 * Analyzes a stem that ends with the nominalizing ending '기' or with a syllable whose
 * final consonant is 'ㅁ' (for example '음').
 *
 * <p>The ending is split off, the given output is updated in place (stem, ending list and
 * pomi), and clones of it are offered in turn to {@link #analysisVMJ},
 * {@link #analysisVMXMJ} and {@link #analysisNSMJ}. If none of them succeeds and the
 * remaining stem is a dictionary verb, {@code o} itself is added as a PTN_VMJ candidate.
 *
 * @param o          analysis output to examine; mutated once the ending has been split off
 * @param candidates list that receives the successful analysis, if any
 * @return {@code true} if a candidate was added, {@code false} otherwise
 * @throws org.apache.lucene.analysis.kr.morph.MorphException
 *          if cloning {@code o} fails, or if one of the helper utilities raises it
 */
public static boolean analysisMJ(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException {
    final String stem = o.getStem();
    final int strlen = stem.length();
    if (strlen < 2) {
        return false;
    }

    final char last = stem.charAt(strlen - 1);
    final char[] chrs = MorphUtil.decompose(last);
    final boolean endsWithGi = last == '기';
    // Only '기' or a syllable with a third component (final consonant) of 'ㅁ' qualifies.
    if (!endsWithGi && !(chrs.length == 3 && chrs[2] == 'ㅁ')) {
        return false;
    }

    // '기' and '음' are whole-syllable endings and are cut off here. For any other syllable
    // ending in 'ㅁ' the full stem is passed on with an empty ending.
    String start = stem;
    String end = "";
    if (endsWithGi || last == '음') {
        start = stem.substring(0, strlen - 1);
        end = endsWithGi ? "기" : "음";
    }

    String[] eomis = EomiUtil.splitEomi(start, end);
    if (eomis[0] == null) {
        return false;
    }

    String[] pomis = EomiUtil.splitPomi(eomis[0]);
    o.setStem(pomis[0]);
    o.addElist(eomis[1]);
    o.setPomi(pomis[1]);

    try {
        // Each attempt works on its own clone because the sub-analyses mutate their argument.
        if (analysisVMJ(o.clone(), candidates)
                || analysisVMXMJ(o.clone(), candidates)
                || analysisNSMJ(o.clone(), candidates)) {
            return true;
        }
    } catch (CloneNotSupportedException e) {
        throw new MorphException(e.getMessage(), e);
    }

    // Last resort: accept the remaining stem as-is when the dictionary knows it as a verb.
    if (DictionaryUtil.getVerb(o.getStem()) != null) {
        o.setPos(PatternConstants.POS_VERB);
        o.setPatn(PatternConstants.PTN_VMJ);
        o.setScore(AnalysisOutput.SCORE_CORRECT);
        candidates.add(o);
        return true;
    }
    return false;
}
/**
 * Verb/adjective + '음/기' + particle (PTN_VMJ).
 *
 * <p>Tries to restore an irregular conjugation from the stem and the first ending in the
 * ending list, then accepts the analysis when the resulting stem is a dictionary verb.
 *
 * @param o          analysis output to examine; its stem and first ending may be rewritten
 * @param candidates list that receives {@code o} when the analysis succeeds
 * @return {@code true} if {@code o} was added to {@code candidates}
 * @throws org.apache.lucene.analysis.kr.morph.MorphException
 *          declared for the helper utilities called here
 */
public static boolean analysisVMJ(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException {
    String[] restored = IrregularUtil.restoreIrregularVerb(o.getStem(), o.getElist().get(0));
    if (restored != null) {
        // Replace the surface forms with the restored stem and ending.
        o.setStem(restored[0]);
        o.setElist(restored[1], 0);
    }

    if (DictionaryUtil.getVerb(o.getStem()) == null) {
        return false;
    }

    o.setPatn(PatternConstants.PTN_VMJ);
    o.setPos(PatternConstants.POS_VERB);
    o.setScore(AnalysisOutput.SCORE_CORRECT);
    candidates.add(o);
    return true;
}
/**
 * Verb/adjective + '아/어' + auxiliary verb + '음/기' + particle (PTN_VMXMJ),
 * e.g. a stem such as 사랑받아보다.
 *
 * <p>When the stem ends with an auxiliary verb, that part is stored as the xverb and the
 * remainder is split into stem and ending. If the resulting stem is not a dictionary verb,
 * the analysis falls back to {@link #analysisNSMXMJ}.
 *
 * @param o          analysis output to examine; mutated in place
 * @param candidates list that receives {@code o} when the analysis succeeds
 * @return {@code true} if {@code o} was added to {@code candidates}
 * @throws org.apache.lucene.analysis.kr.morph.MorphException
 *          declared for the helper utilities called here
 */
public static boolean analysisVMXMJ(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException {
    final String stem = o.getStem();
    final int idxXVerb = VerbUtil.endsWithXVerb(stem);
    if (idxXVerb == -1) {
        return false;
    }

    final String eogan = stem.substring(0, idxXVerb);
    o.setXverb(stem.substring(idxXVerb));

    // A trailing '아'/'어' is handed to splitEomi as the ending; otherwise the ending is empty.
    final String[] stomis;
    if (eogan.endsWith("아") || eogan.endsWith("어")) {
        final int cut = eogan.length() - 1;
        stomis = EomiUtil.splitEomi(eogan.substring(0, cut), eogan.substring(cut));
    } else {
        stomis = EomiUtil.splitEomi(eogan, "");
    }
    if (stomis[0] == null) {
        return false;
    }

    // Prefer the restored irregular form when there is one.
    final String[] irrs = IrregularUtil.restoreIrregularVerb(stomis[0], stomis[1]);
    final String[] parts = (irrs != null) ? irrs : stomis;
    o.setStem(parts[0]);
    o.addElist(parts[1]);

    if (DictionaryUtil.getVerb(o.getStem()) != null) {
        o.setPatn(PatternConstants.PTN_VMXMJ);
        o.setPos(PatternConstants.POS_VERB);
        o.setScore(AnalysisOutput.SCORE_CORRECT);
        candidates.add(o);
        return true;
    }
    return analysisNSMXMJ(o, candidates);
}
/**
 * Noun + verbalizing suffix + '음/기' + particle (PTN_NSMJ).
 *
 * @param o          analysis output to examine; mutated in place, even when the analysis
 *                   is rejected after the suffix has been split off
 * @param candidates list that receives {@code o} when the analysis succeeds
 * @return {@code true} if {@code o} was added to {@code candidates}
 * @throws org.apache.lucene.analysis.kr.morph.MorphException
 *          declared for the helper utilities called here
 */
public static boolean analysisNSMJ(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException {
    return analysisNounWithVerbSuffix(o, candidates, false);
}

/**
 * Same analysis as {@link #analysisNSMJ}, but the result is tagged PTN_NSMXMJ. It is used
 * as the fallback of {@link #analysisVMXMJ}.
 *
 * @param o          analysis output to examine; mutated in place, even when the analysis
 *                   is rejected after the suffix has been split off
 * @param candidates list that receives {@code o} when the analysis succeeds
 * @return {@code true} if {@code o} was added to {@code candidates}
 * @throws org.apache.lucene.analysis.kr.morph.MorphException
 *          declared for the helper utilities called here
 */
public static boolean analysisNSMXMJ(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException {
    return analysisNounWithVerbSuffix(o, candidates, true);
}

/**
 * Shared implementation of {@link #analysisNSMJ} and {@link #analysisNSMXMJ}: splits a
 * trailing verbalizing suffix off the stem and checks the remaining noun in the dictionary.
 *
 * @param withAuxiliaryVerb {@code true} to tag the result PTN_NSMXMJ, {@code false} for PTN_NSMJ
 */
private static boolean analysisNounWithVerbSuffix(AnalysisOutput o, List<AnalysisOutput> candidates,
                                                  boolean withAuxiliaryVerb) throws MorphException {
    int idxVbSfix = VerbUtil.endsWithVerbSuffix(o.getStem());
    if (idxVbSfix == -1) {
        return false;
    }

    o.setVsfx(o.getStem().substring(idxVbSfix));
    o.setStem(o.getStem().substring(0, idxVbSfix));
    o.setPatn(withAuxiliaryVerb ? PatternConstants.PTN_NSMXMJ : PatternConstants.PTN_NSMJ);
    o.setPos(PatternConstants.POS_NOUN);

    WordEntry entry = DictionaryUtil.getWordExceptVerb(o.getStem());
    if (entry == null) {
        // Unregistered word: still accepted, but with the weaker SCORE_ANALYSIS.
        // NOTE(review): the original comment justified this by citing '입니다' (names and
        // other unregistered words are common); it looks copied from another code path.
        o.setScore(AnalysisOutput.SCORE_ANALYSIS);
    } else {
        // Reject entries whose noun feature is '0'.
        if (entry.getFeature(WordEntry.IDX_NOUN) == '0') {
            return false;
        }
        // The suffixes 하 / 되 / 내 are accepted only when the matching feature flag is '1'.
        if (o.getVsfx().equals("하") && entry.getFeature(WordEntry.IDX_DOV) != '1') {
            return false;
        }
        if (o.getVsfx().equals("되") && entry.getFeature(WordEntry.IDX_BEV) != '1') {
            return false;
        }
        if (o.getVsfx().equals("내") && entry.getFeature(WordEntry.IDX_NE) != '1') {
            return false;
        }
        o.setScore(AnalysisOutput.SCORE_CORRECT);
    }

    candidates.add(o);
    return true;
}
// /**
// * 복합명사인지 조사하고, 복합명사이면 단위명사들을 찾는다.
// * 복합명사인지 여부는 단위명사가 모두 사전에 있는지 여부로 판단한다.
// * 단위명사는 2글자 이상 단어에서만 찾는다.
// * @param o
// * @return
// * @throws org.apache.lucene.analysis.MorphException
// */
// public static boolean confirmCNoun(AnalysisOutput o) throws MorphException {
//
// if(o.getStem().length()<3) return false;
// if(o.getPatn()==PatternConstants.PTN_N
// &&DictionaryUtil.existJosa(o.getStem().substring(o.getStem().length()-2))) return false;
//
// List<CompoundEntry> results = new ArrayList();
// List<List> queue = new ArrayList();
// String prefix = o.getStem().substring(0,1);
//
// int pos = 0;
// boolean moreTwo = false;
// while(pos<o.getStem().length()) {
//
// List<WordEntry> nList = findNouns(o.getStem().substring(pos),queue.size(),o);
// if(nList==null) return false;
//
// if(pos==0&&DictionaryUtil.existPrefix(prefix)) nList.add(new WordEntry(prefix));
//
// if(nList.size()==0) {
// if(queue.size()==0) return false;
// List<WordEntry> tmpList = queue.get(queue.size()-1);
//
// tmpList.remove(tmpList.size()-1);
// pos -= results.get(queue.size()-1).getWord().length();
// if(tmpList.size()==0) {
// while(tmpList.size()==0) {
// results.remove(queue.size()-1);
// queue.remove(tmpList);
// if(queue.size()==0) return false;
//
// tmpList = queue.get(queue.size()-1);
// tmpList.remove(tmpList.size()-1);
// if(tmpList.size()==0) continue;
//
// pos -= results.get(queue.size()-1).getWord().length();
// results.set(queue.size()-1, new CompoundEntry(tmpList.get(tmpList.size()-1).getWord(),pos));
// pos += tmpList.get(tmpList.size()-1).getWord().length();
//
// }
// }else {
// results.set(queue.size()-1, new CompoundEntry(tmpList.get(tmpList.size()-1).getWord(),pos));
// pos += tmpList.get(tmpList.size()-1).getWord().length();
// }
//
// } else {
// queue.add(nList);
// WordEntry noun = nList.get(nList.size()-1);
// results.add(new CompoundEntry(noun.getWord(),pos));
// pos += noun.getWord().length();
// if(noun.getCompounds().size()>0) o.addCNoun(noun.getCompounds());
// if(noun.getWord().length()>1) moreTwo=true;
// }
// }
//
// if(results.size()>1&&DNouns.contains(results.get(results.size()-1).getWord())) {
// CompoundEntry dnoun = results.remove(results.size()-1);
// o.setStem(o.getStem().substring(0,o.getStem().length()-dnoun.getWord().length()));
// o.setNsfx(dnoun.getWord());
// }
//
// if(results.size()>1) o.addCNoun(results);
//
// o.setScore(AnalysisOutput.SCORE_CORRECT);
// return true;
// }
/**
 * Collects the dictionary nouns that are prefixes of {@code str}.
 * Candidates are appended in order of increasing length, so the last element of the
 * returned list is the longest one.
 *
 * @param str the text to split into unit nouns
 * @param pos only used to reject a particle-only remainder once it is 2 or more
 *            (NOTE(review): appears to be the number of unit nouns found so far; confirm
 *            with the caller)
 * @param o   analysis result; its stem and noun suffix are updated when {@code str} is a
 *            two-character "suffix + dependent noun" sequence
 * @return the candidate nouns (possibly empty), or {@code null} when {@code str} ends in or
 *         consists of a particle and the split should be abandoned
 * @throws org.apache.lucene.analysis.kr.morph.MorphException
 *          declared for the dictionary lookups called here
 */
private static List<WordEntry> findNouns(String str, int pos, AnalysisOutput o) throws MorphException {
    // Ask the logger directly: a flag cached at class-load time can go stale.
    if (log.isTraceEnabled()) {
        log.trace("복합명사에서 단위명사를 분리합니다. str=[{}], pos=[{}]", str, pos);
    }

    List<WordEntry> nList = new ArrayList<WordEntry>();

    if (str.length() == 2 && DictionaryUtil.existSuffix(str.substring(0, 1))) {
        String second = str.substring(1);
        if (DNouns.contains(second)) {
            // "suffix + dependent noun": move the dependent noun out of the stem.
            o.setStem(o.getStem().substring(0, o.getStem().length() - 1));
            o.setNsfx(second);
            nList.add(new WordEntry(str.substring(0, 1)));
            return nList;
        }
        if (DictionaryUtil.existJosa(second)) {
            // "suffix + particle": not a compound-noun component.
            return null;
        }
    }

    if (pos >= 2 && DictionaryUtil.existJosa(str)) {
        return null;
    }

    if (str.length() == 1 && (DictionaryUtil.existSuffix(str) || DNouns.contains(str))) {
        nList.add(new WordEntry(str));
        return nList;
    }

    // Try prefixes of length 2, 3, ... and stop as soon as no dictionary word starts with one.
    for (int i = 1; i < str.length(); i++) {
        String sub = str.substring(0, i + 1);
        if (!DictionaryUtil.findWithPrefix(sub).hasNext()) {
            break;
        }
        WordEntry entry = DictionaryUtil.getCNoun(sub);
        if (entry != null) {
            nList.add(entry);
        }
    }
    return nList;
}
/**
 * Checks whether the last syllable of the stem is one of the noun suffixes in
 * {@code DNouns} (등, 상, ...). If so, the syllable is moved from the stem to the noun suffix.
 *
 * <p>When the shortened stem is found by {@code DictionaryUtil.getCNoun}, the compound
 * list and score of {@code output} are updated as well. The method returns {@code true}
 * even when that lookup finds nothing.
 *
 * @param output analysis output to examine; mutated in place when a suffix is found
 * @return {@code true} if the stem ended with a noun suffix; {@code false} otherwise,
 *         including when the stem is {@code null} or empty
 * @throws org.apache.lucene.analysis.kr.morph.MorphException
 *          declared for the dictionary lookup called here
 */
public static boolean confirmDNoun(AnalysisOutput output) throws MorphException {
    String stem = output.getStem();
    // An empty stem has no last syllable; substring(-1) would throw.
    if (stem == null || stem.length() == 0) {
        return false;
    }

    int strlen = stem.length();
    String d = stem.substring(strlen - 1);
    if (!DNouns.contains(d)) {
        return false;
    }

    String s = stem.substring(0, strlen - 1);
    output.setNsfx(d);
    output.setStem(s);

    WordEntry cnoun = DictionaryUtil.getCNoun(s);
    if (cnoun != null) {
        // Noun feature '2': the entry carries its own compound decomposition.
        if (cnoun.getFeature(WordEntry.IDX_NOUN) == '2') {
            output.setCNoun(cnoun.getCompounds());
        } else {
            output.setCNoun(new ArrayList<CompoundEntry>());
        }
        output.setScore(AnalysisOutput.SCORE_CORRECT);
    }
    return true;
}
// public static int endsWithDNoun(String stem) {
// for(int i = 0; i < DNouns.length; i++)
// if(stem.endsWith(DNouns[i]))
// return stem.lastIndexOf(DNouns[i]);
//
// return -1;
// }
/**
 * Tests whether {@code input} ends with a particle (josa) that is at least two characters
 * long.
 *
 * <p>Suffixes are tried from the shortest (two characters) to the longest. The suffix
 * starting at index 0 is never tried, so the whole input is never treated as a particle.
 * The scan stops early when the first syllable of the current suffix has its
 * {@code IDX_JOSA2} feature set to '0'.
 *
 * @param input the text to examine
 * @return {@code true} if one of the tried suffixes is a known particle
 * @throws org.apache.lucene.analysis.kr.morph.MorphException
 *          declared for the syllable and dictionary lookups called here
 */
public static boolean endsWith2Josa(String input) throws MorphException {
    int start = input.length() - 2;
    while (start > 0) {
        String tail = input.substring(start);
        char[] feature = SyllableUtil.getFeature(tail.charAt(0));
        if (DictionaryUtil.existJosa(tail)) {
            return true;
        }
        // This syllable's IDX_JOSA2 feature is '0': stop extending the suffix.
        if (feature[SyllableUtil.IDX_JOSA2] == '0') {
            break;
        }
        start--;
    }
    return false;
}
/**
 * Computes the percentage of compound-noun components of {@code o} whose
 * {@code isExist()} flag is set.
 *
 * @param o analysis output whose compound-noun list is inspected
 * @return a value between 0.0 and 100.0; 0.0 when the compound-noun list is empty
 */
public static double countFoundNouns(AnalysisOutput o) {
    final int total = o.getCNounList().size();
    if (total == 0) {
        return 0.0;
    }

    int found = 0;
    for (final CompoundEntry entry : o.getCNounList()) {
        if (entry.isExist()) {
            found++;
        }
    }
    // Divide in floating point: integer division would truncate the fraction before the
    // result is widened to double (1 of 3 would yield 33.0 instead of 33.33...).
    return (found * 100.0) / total;
}
}