package org.apache.lucene.analysis.kr.morph;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.kr.utils.ConstraintUtil;
import org.apache.lucene.analysis.kr.utils.DictionaryUtil;
import org.apache.lucene.analysis.kr.utils.EomiUtil;
import org.apache.lucene.analysis.kr.utils.IrregularUtil;
import org.apache.lucene.analysis.kr.utils.KoreanEnv;
import org.apache.lucene.analysis.kr.utils.MorphUtil;
import org.apache.lucene.analysis.kr.utils.NounUtil;
import org.apache.lucene.analysis.kr.utils.SyllableUtil;
import org.apache.lucene.analysis.kr.utils.VerbUtil;
public class MorphAnalyzer {
/**
* Position flag: the word is the starting word of a sentence.
*/
public static final int POS_START = 1;
/**
* Position flag: the word is a middle word of a sentence.
*/
public static final int POS_MID = 2;
/**
* Position flag: the word is the ending word of a sentence.
*/
public static final int POS_END = 3;
/**
* Compound-noun analyzer used by {@link #confirmCNoun(AnalysisOutput)} to split a stem
* into unit nouns. Its exact-match flag is set in the constructor and can be changed
* through {@link #setExactCompound(boolean)}.
*/
private CompoundNounAnalyzer cnAnalyzer = new CompoundNounAnalyzer();
/**
* Creates an analyzer whose compound-noun analyzer starts with its exact-match
* flag switched off.
*/
public MorphAnalyzer() {
    this.cnAnalyzer.setExactMach(false);
}
/**
* Forwards the exact-match flag to the compound-noun analyzer.
*
* @param is value passed on to {@code CompoundNounAnalyzer.setExactMach}
*/
public void setExactCompound(boolean is) {
    this.cnAnalyzer.setExactMach(is);
}
/**
* Analyzes a single word, deriving the sentence position from a trailing period.
* A word ending in "." is analyzed without that period as {@link #POS_END};
* any other word is analyzed unchanged as {@link #POS_MID}.
*
* @param input the word to analyze
* @return the analysis results produced by {@link #analyze(String, int)}
* @throws MorphException if the underlying analysis fails
*/
public List analyze(String input) throws MorphException {
    boolean endsWithPeriod = input.endsWith(".");
    if(!endsWithPeriod) {
        return analyze(input, POS_MID);
    }
    String withoutPeriod = input.substring(0, input.length()-1);
    return analyze(withoutPeriod, POS_END);
}
/**
* Analyzes a single word and returns the accepted morphological analyses.
* <p>
* The method works in four stages:
* <ol>
* <li>collect candidates by rule-based splitting and, unless the word is verb-only and
* candidates were already found, add a whole-word candidate;</li>
* <li>sort the candidates and try compound-noun decomposition on the eligible ones,
* re-sorting when a score reached {@code SCORE_COMPOUNDS} or higher;</li>
* <li>pick the results from the sorted candidates;</li>
* <li>if nothing was picked, return the whole input as a single noun.</li>
* </ol>
*
* @param input the word to analyze
* @param pos one of {@link #POS_START}, {@link #POS_MID}, {@link #POS_END}; kept for
* API compatibility, this method does not read it
* @return a list of {@code AnalysisOutput}; never empty
* @throws MorphException if any analysis step fails
*/
public List analyze(String input, int pos) throws MorphException {
    List<AnalysisOutput> candidates = new ArrayList<AnalysisOutput>();
    boolean isVerbOnly = MorphUtil.hasVerbOnly(input);

    analysisByRule(input, candidates);
    if(!isVerbOnly||candidates.size()==0) addSingleWord(input,candidates);

    Collections.sort(candidates,new AnalysisOutputComparator());

    // Decide whether compound nouns should be decomposed, and decompose them.
    boolean changed = false;
    boolean correct = false;
    for(AnalysisOutput o:candidates) {
        if(o.getScore()==AnalysisOutput.SCORE_CORRECT) {
            if(o.getPatn()!=PatternConstants.PTN_NJ) correct=true;
            // Prevents "활성화해", already analyzed successfully as
            // [활성화(N),하(t),어야(e)], from being split into [활성/화해].
            if(o.getPatn()==PatternConstants.PTN_NSM) break;
            continue;
        }
        if(o.getPatn()<PatternConstants.PTN_VM&&o.getStem().length()>2) {
            if(!(correct&&o.getPatn()==PatternConstants.PTN_N)) confirmCNoun(o);
            if(o.getScore()>=AnalysisOutput.SCORE_COMPOUNDS) changed=true;
        }
    }

    if(changed) {
        Collections.sort(candidates,new AnalysisOutputComparator());
    }

    List<AnalysisOutput> results = new ArrayList<AnalysisOutput>();
    HashMap<String, AnalysisOutput> stems = new HashMap<String, AnalysisOutput>();

    boolean hasCorrect = false;      // a correct non-noun analysis was accepted
    boolean hasCorrectNoun = false;  // a correct noun analysis was accepted
    double ratio = 0;                // best NounUtil.countFoundNouns value seen so far
    AnalysisOutput compound = null;  // best not-yet-correct compound-noun candidate

    for(AnalysisOutput o:candidates) {
        // Analysis succeeded but the constraints rejected it.
        if(o.getScore()==AnalysisOutput.SCORE_FAIL) continue;

        boolean isNoun = o.getPos()==PatternConstants.POS_NOUN;
        boolean isCorrect = o.getScore()==AnalysisOutput.SCORE_CORRECT;

        if(isCorrect&&!isNoun) {
            addResults(o,results,stems);
            hasCorrect = true;
        }else if(isCorrect) {
            // A correct noun: skip decomposed compounds once a correct non-noun exists.
            if(hasCorrect&&o.getCNounList().size()>0) continue;
            addResults(o,results,stems);
            hasCorrectNoun = true;
        }else if(isNoun&&o.getCNounList().size()>0&&!hasCorrect&&!hasCorrectNoun) {
            double curatio = NounUtil.countFoundNouns(o);
            if(ratio<curatio&&(compound==null||compound.getJosa()==null)) {
                ratio = curatio;
                compound = o;
            }
        }else if(isNoun&&!hasCorrect&&!hasCorrectNoun&&compound==null) {
            addResults(o,results,stems);
        }else if(o.getPatn()==PatternConstants.PTN_NSM) {
            addResults(o,results,stems);
        }
    }

    if(compound!=null) addResults(compound,results,stems);

    // Nothing accepted: fall back to treating the whole input as one noun.
    if(results.size()==0) {
        AnalysisOutput output = new AnalysisOutput(input, null, null, PatternConstants.PTN_N, AnalysisOutput.SCORE_ANALYSIS);
        output.setPos(PatternConstants.POS_NOUN);
        results.add(output);
    }

    return results;
}
/**
* Generates candidates by trying every stem/ending split of the word, scanning the
* split point from the right end towards the left.
* <p>
* The whole word is first tried with an empty ending. Each split is then tried as
* noun + josa (only when the first syllable of the ending has the {@code IDX_JOSA1}
* feature) and as stem + eomi. Scanning stops once the syllable features have ruled
* out both josa and eomi for all further splits.
*
* @param input the word to split
* @param candidates list that receives the candidates
* @throws MorphException if an analysis step fails
*/
private void analysisByRule(String input, List candidates) throws MorphException {
    analysisWithEomi(input, "", candidates);

    boolean josaPossible = true;
    boolean eomiPossible = true;
    int split = input.length()-1;

    while(split>0 && (josaPossible||eomiPossible)) {
        String head = input.substring(0, split);
        String tail = input.substring(split);
        char[] feature = SyllableUtil.getFeature(input.charAt(split));

        if(josaPossible && feature[SyllableUtil.IDX_JOSA1]=='1') {
            analysisWithJosa(head, tail, candidates);
        }
        if(eomiPossible) {
            analysisWithEomi(head, tail, candidates);
        }

        // Once a syllable cannot continue a josa/eomi, longer endings are ruled out too.
        if(josaPossible && feature[SyllableUtil.IDX_JOSA2]=='0') josaPossible = false;
        if(eomiPossible && feature[SyllableUtil.IDX_EOMI2]=='0') eomiPossible = false;

        split--;
    }
}
/**
* Adds an analysis to the result list, keeping at most one entry per stem and
* part of speech.
* <p>
* If an entry with the same stem and the same part of speech is already recorded,
* the new one replaces it only when its pattern value is greater; otherwise the new
* one is dropped. An entry with the same stem but a different part of speech is left
* in the list and the new one is added alongside it.
*
* @param o the analysis to add
* @param results the result list being built
* @param stems the most recently recorded analysis per stem
*/
private void addResults(AnalysisOutput o, List results, HashMap<String, AnalysisOutput> stems) {
    AnalysisOutput previous = stems.get(o.getStem());
    boolean samePos = previous!=null && previous.getPos()==o.getPos();

    if(samePos) {
        if(previous.getPatn()>=o.getPatn()) return;
        results.remove(previous);
    }

    results.add(o);
    stems.put(o.getStem(), o);
}
/**
* Inserts a candidate for the unsplit word at the head of the candidate list.
* <ul>
* <li>Word not in the dictionary: a noun candidate scored {@code SCORE_ANALYSIS} is
* inserted when there are no candidates yet or the word does not end with a
* two-syllable josa according to {@code NounUtil.endsWith2Josa}.</li>
* <li>Dictionary entry whose noun feature is not '1' but whose adverb (busa) feature
* is '1': a {@code PTN_AID} candidate scored {@code SCORE_CORRECT}.</li>
* <li>Noun feature '1': the noun candidate scored {@code SCORE_CORRECT}.</li>
* <li>Noun feature '2': the noun candidate with its constructor-default score.</li>
* </ul>
*
* @param word the whole word
* @param candidates list that receives the candidate at index 0
* @throws MorphException if the dictionary lookup fails
*/
private void addSingleWord(String word, List<AnalysisOutput> candidates) throws MorphException {
    AnalysisOutput nounOutput = new AnalysisOutput(word, null, null, PatternConstants.PTN_N);
    nounOutput.setPos(PatternConstants.POS_NOUN);

    WordEntry entry = DictionaryUtil.getWord(word);
    if(entry==null) {
        if(candidates.size()==0||!NounUtil.endsWith2Josa(word)) {
            nounOutput.setScore(AnalysisOutput.SCORE_ANALYSIS);
            candidates.add(0,nounOutput);
        }
        return;
    }

    boolean plainNoun = entry.getFeature(WordEntry.IDX_NOUN)=='1';
    if(!plainNoun&&entry.getFeature(WordEntry.IDX_BUSA)=='1') {
        AnalysisOutput adverbOutput = new AnalysisOutput(word, null, null, PatternConstants.PTN_AID);
        adverbOutput.setPos(PatternConstants.POS_ETC);
        adverbOutput.setScore(AnalysisOutput.SCORE_CORRECT);
        candidates.add(0,adverbOutput);
    }else if(plainNoun) {
        nounOutput.setScore(AnalysisOutput.SCORE_CORRECT);
        candidates.add(0,nounOutput);
    }else if(entry.getFeature(WordEntry.IDX_NOUN)=='2') {
        candidates.add(0,nounOutput);
    }

    // NOTE(review): nothing follows this check, so it has no effect; kept as in the
    // original in case verb handling was meant to be added after it.
    if(entry.getFeature(WordEntry.IDX_VERB)!='1') return;
}
/**
* Tries to read the word as a noun followed by a particle (josa).
* <p>
* Patterns listed by the original author: <br>
* noun + josa (PTN_NJ) <br>
* noun + verbalizing suffix + '음/기' + josa (PTN_NSMJ) <br>
* predicate + '음/기' + josa (PTN_VMJ) <br>
* predicate + '아/어' + auxiliary predicate + '음/기' + josa (PTN_VMXMJ) <br>
* <p>
* This method builds the PTN_NJ candidate itself and hands a copy to
* {@code NounUtil.analysisMJ}, which may add further candidates.
*
* @param stem the part before the josa; nothing is done when null or empty
* @param end the josa candidate
* @param candidates list that receives the candidates
* @throws MorphException if a lookup fails or the working output cannot be cloned
*/
public void analysisWithJosa(String stem, String end, List candidates) throws MorphException {
    if(stem==null||stem.length()==0) return;

    char[] chrs = MorphUtil.decompose(stem.charAt(stem.length()-1));

    // Give up unless the ending is a josa that can attach to this stem.
    if(!DictionaryUtil.existJosa(end)) return;
    if(chrs.length==3&&ConstraintUtil.isTwoJosa(end)) return;
    if(chrs.length==2&&ConstraintUtil.isThreeJosa(end)) return;
    if("".equals(end)) return;

    AnalysisOutput output = new AnalysisOutput(stem, end, null, PatternConstants.PTN_NJ);
    output.setPos(PatternConstants.POS_NOUN);

    try {
        // The return value is not used; any candidates it finds go into the list.
        NounUtil.analysisMJ(output.clone(), candidates);
    } catch (CloneNotSupportedException e) {
        throw new MorphException(e.getMessage(),e);
    }

    WordEntry entry = DictionaryUtil.getWordExceptVerb(stem);
    if(entry==null) {
        // Unknown stem: drop it when it can only be a verb.
        if(MorphUtil.hasVerbOnly(stem)) return;
    } else {
        output.setScore(AnalysisOutput.SCORE_CORRECT);
        boolean adverbOnly = entry.getFeature(WordEntry.IDX_NOUN)=='0'
                &&entry.getFeature(WordEntry.IDX_BUSA)=='1';
        if(adverbOnly) {
            output.setPos(PatternConstants.POS_ETC);
            output.setPatn(PatternConstants.PTN_ADVJ);
        }
    }

    candidates.add(output);
}
/**
* Tries to read the word as a predicate stem followed by an ending (eomi) and
* appends the resulting candidates.
* <p>
* Patterns listed by the original author (examples kept in Korean): <br>
* 1. 사랑받다 : noun + verbalizing suffix + ending (PTN_NSM) <br>
* 2. 사랑받아보다 : noun + verbalizing suffix + '아/어' + auxiliary verb + ending (PTN_NSMXM) <br>
* 3. 학교에서이다 : noun + '에서/부터/에서부터' + '이' + ending (PTN_NJCM) <br>
* 4. 돕다 : verb + ending (PTN_VM) <br>
* 5. 도움이다 : verb + '음/기' + '이' + ending (PTN_VMCM) <br>
* 6. 도와주다 : verb + '아/어' + auxiliary verb + ending (PTN_VMXM) <br>
*
* @param stem the leading part of the word
* @param end the trailing part of the word, tried as the ending; the caller also passes the empty string
* @param candidates list that receives the candidates
* @throws MorphException if a helper fails; a CloneNotSupportedException raised while
* copying the working output is wrapped in a MorphException
*/
public void analysisWithEomi(String stem, String end, List candidates) throws MorphException {
// morphs[0] is the stem part and morphs[1] the ending as returned by splitEomi.
String[] morphs = EomiUtil.splitEomi(stem, end);
if(morphs[0]==null) return; // the ending is not registered in the dictionary
// pomis[0] becomes the stem of the working output, pomis[1] its pomi.
String[] pomis = EomiUtil.splitPomi(morphs[0]);
AnalysisOutput o = new AnalysisOutput(pomis[0],null,morphs[1],PatternConstants.PTN_VM);
o.setPomi(pomis[1]);
try {
WordEntry entry = DictionaryUtil.getVerb(o.getStem());
// Stem is a dictionary verb, except when the ending is "을" and the verb's IDX_REGURA feature equals IRR_TYPE_LIUL.
if(entry!=null&&!("을".equals(end)&&entry.getFeature(WordEntry.IDX_REGURA)==IrregularUtil.IRR_TYPE_LIUL)) {
AnalysisOutput output = o.clone();
output.setScore(AnalysisOutput.SCORE_CORRECT);
MorphUtil.buildPtnVM(output, candidates);
char[] features = SyllableUtil.getFeature(stem.charAt(stem.length()-1)); // the ㄹ-irregular case
// Stop here unless the IDX_YNPLN feature is set and the ending starts with 'ㄴ'; otherwise fall through to the irregular handling below.
if(features[SyllableUtil.IDX_YNPLN]=='0'||morphs[1].charAt(0)!='ㄴ') return;
}
// Restore an irregular conjugation, using the pomi when present and the eomi otherwise.
String[] irrs = IrregularUtil.restoreIrregularVerb(o.getStem(), o.getPomi()==null?o.getEomi():o.getPomi());
if(irrs!=null) { // the irregular-verb case
AnalysisOutput output = o.clone();
output.setStem(irrs[0]);
if(output.getPomi()==null)
output.setEomi(irrs[1]);
else
output.setPomi(irrs[1]);
// entry = DictionaryUtil.getVerb(output.getStem());
// if(entry!=null && VerbUtil.constraintVerb(o.getStem(), o.getPomi()==null?o.getEomi():o.getPomi())) { // 4. 돕다 (PTN_VM)
output.setScore(AnalysisOutput.SCORE_CORRECT);
MorphUtil.buildPtnVM(output, candidates);
// }
}
// Try the remaining patterns in order on fresh copies; the first helper that returns true ends the analysis.
if(VerbUtil.ananlysisNSM(o.clone(), candidates)) return;
if(VerbUtil.ananlysisNSMXM(o.clone(), candidates)) return;
// [noun + '에서/에서부터' + '이' + ending]
if(VerbUtil.ananlysisNJCM(o.clone(),candidates)) return;
if(VerbUtil.analysisVMCM(o.clone(),candidates)) return;
VerbUtil.analysisVMXM(o.clone(), candidates);
} catch (CloneNotSupportedException e) {
throw new MorphException(e.getMessage(),e);
}
}
/**
* Runs compound-noun confirmation over the noun candidates, in list order, until
* the first noun candidate scored {@code SCORE_CORRECT} is met. Noun candidates
* after that one are left untouched; non-noun candidates are always skipped.
*
* @param candidates the candidates to inspect
* @throws MorphException if compound-noun confirmation fails
*/
public void analysisCNoun(List<AnalysisOutput> candidates) throws MorphException {
    boolean correctNounSeen = false;
    for(AnalysisOutput candidate : candidates) {
        boolean isNoun = candidate.getPos()==PatternConstants.POS_NOUN;
        if(!isNoun) continue;

        if(candidate.getScore()==AnalysisOutput.SCORE_CORRECT) {
            correctNounSeen = true;
            continue;
        }
        if(!correctNounSeen) {
            confirmCNoun(candidate);
        }
    }
}
/**
* Checks whether the stem of the given analysis is a compound noun and, if so,
* records its unit nouns on the analysis.
* <p>
* A stem counts as a compound noun when every unit noun is found in the dictionary.
* Stems shorter than three characters are rejected immediately.
* <p>
* The score of {@code o} is updated as a side effect: {@code SCORE_CORRECT} when the
* compound is confirmed, {@code SCORE_COMPOUNDS} when it was split but not every unit
* was found, and {@code SCORE_FAIL} when {@code constraint} rejects the split.
*
* @param o the analysis whose stem is examined; modified in place
* @return true when the analysis ended up confirmed, false otherwise
* @throws MorphException if a dictionary lookup or the compound analysis fails
*/
public boolean confirmCNoun(AnalysisOutput o) throws MorphException {
    if(o.getStem().length()<3) return false;

    // A compound noun registered in the dictionary is accepted as is.
    WordEntry registered = DictionaryUtil.getCNoun(o.getStem());
    if(registered!=null && registered.getFeature(WordEntry.IDX_NOUN)=='2') {
        o.addCNoun(registered.getCompounds());
        o.setScore(AnalysisOutput.SCORE_CORRECT);
        return true;
    }

    List<CompoundEntry> units = cnAnalyzer.analyze(o.getStem());

    boolean allUnitsKnown = false;
    if(units.size()>1) {
        o.setCNoun(units);
        allUnitsKnown = true;
        for(CompoundEntry unit : units) {
            if(!unit.isExist()) allUnitsKnown = false;
        }
        o.setScore(AnalysisOutput.SCORE_COMPOUNDS);
    }

    if(allUnitsKnown) {
        if(!constraint(o)) {
            o.setScore(AnalysisOutput.SCORE_FAIL);
            return false;
        }
        o.setScore(AnalysisOutput.SCORE_CORRECT);
        return true;
    }

    // Not every unit is known: let NounUtil.confirmDNoun try, and re-run the
    // confirmation when it reports success without having marked the analysis correct.
    if(NounUtil.confirmDNoun(o)&&o.getScore()!=AnalysisOutput.SCORE_CORRECT) {
        confirmCNoun(o);
    }
    boolean confirmed = o.getScore()==AnalysisOutput.SCORE_CORRECT;

    // The constraint check may still fail the analysis; the return value stays as computed above.
    if(o.getCNounList().size()>0&&!constraint(o)) {
        o.setScore(AnalysisOutput.SCORE_FAIL);
    }
    return confirmed;
}
/**
* Checks a compound-noun decomposition against a few hand-written constraints.
* <ul>
* <li>If the last unit is "화해", the unit before it must be accepted by
* {@code ConstraintUtil.canHaheCompound}.</li>
* <li>Otherwise, for a {@code PTN_NSM} analysis with verbalizing suffix "내": a last
* unit longer than one character is rejected when its dictionary entry has
* {@code IDX_NE} equal to '0'.</li>
* <li>Otherwise, for a {@code PTN_NSM} analysis with verbalizing suffix "하": a
* single-character last unit is rejected.</li>
* </ul>
* An analysis with no unit nouns, or whose only unit is "화해", has nothing to check
* and is accepted instead of failing with an index error.
*
* @param o the analysis whose compound-noun list is checked
* @return true when the decomposition is acceptable, false when it must be rejected
* @throws MorphException if a dictionary lookup fails
*/
private boolean constraint(AnalysisOutput o) throws MorphException {
    List<CompoundEntry> cnouns = o.getCNounList();
    if(cnouns==null||cnouns.isEmpty()) return true;

    String lastWord = cnouns.get(cnouns.size()-1).getWord();

    if("화해".equals(lastWord)) {
        // A preceding unit is required for the check; without one there is nothing to reject.
        if(cnouns.size()<2) return true;
        if(!ConstraintUtil.canHaheCompound(cnouns.get(cnouns.size()-2).getWord())) return false;
    }else if(o.getPatn()==PatternConstants.PTN_NSM) {
        if("내".equals(o.getVsfx())&&lastWord.length()!=1) {
            WordEntry entry = DictionaryUtil.getWord(lastWord);
            if(entry!=null&&entry.getFeature(WordEntry.IDX_NE)=='0') return false;
        }else if("하".equals(o.getVsfx())&&lastWord.length()==1) {
            // In cases like 짝사랑하다, exclude the split when the trailing unit is a single character.
            return false;
        }
    }
    return true;
}
}