/*
* Copyright 2011-2013 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.kr.morph;
import org.apache.lucene.analysis.kr.utils.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
/**
 * Rule-based morphological analyzer for a single Korean word (eojeol).
 *
 * <p>A word is split at every position into a stem and a trailing part, and each split is
 * tried as "noun + josa (particle)" and as "predicate + eomi (ending)". The resulting
 * candidates are sorted, compound-noun decomposition is attempted on selected noun
 * candidates, and the final result list is assembled from the scored candidates.
 */
public class MorphAnalyzer {

    private static final Logger log = LoggerFactory.getLogger(MorphAnalyzer.class);

    /** starting word of sentence. */
    public static final int POS_START = 1;

    /** middle word of sentence */
    public static final int POS_MID = 2;

    /** ending word of sentence. */
    public static final int POS_END = 3;

    /** Compound noun decomposer used by {@link #confirmCNoun(AnalysisOutput)}. */
    private final CompoundNounAnalyzer cnAnalyzer = new CompoundNounAnalyzer();

    /** Creates an analyzer whose compound noun analyzer starts with exact matching turned off. */
    public MorphAnalyzer() {
        cnAnalyzer.setExactMach(false);
    }

    /**
     * Forwards the exact-match flag to the underlying compound noun analyzer.
     *
     * @param is value passed to {@code CompoundNounAnalyzer.setExactMach}
     */
    public final void setExactCompound(boolean is) {
        cnAnalyzer.setExactMach(is);
    }

    /**
     * Analyzes a word. A single trailing {@code "."} is stripped and the word is then
     * analyzed with {@link #POS_END}; otherwise the word is analyzed with {@link #POS_MID}.
     *
     * @param input the word to analyze; must not be {@code null}
     * @return the analysis results; never empty
     * @throws MorphException if the analysis fails
     */
    public final List<AnalysisOutput> analyze(String input) throws MorphException {
        if (input.endsWith(".")) {
            return analyze(input.substring(0, input.length() - 1), POS_END);
        }
        return analyze(input, POS_MID);
    }

    /**
     * Analyzes a word and returns the selected analyses.
     *
     * <p>When no candidate survives selection, a single fallback output is returned that
     * treats the whole input as a noun with score {@code SCORE_ANALYSIS}.
     *
     * @param input the word to analyze
     * @param pos   position of the word in the sentence; in this method it is only logged
     * @return the analysis results; never empty
     * @throws MorphException if the analysis fails
     */
    @SuppressWarnings("unchecked")
    public List<AnalysisOutput> analyze(String input, int pos) throws MorphException {
        // Query the logger on every call so that runtime log-level changes are honored.
        if (log.isTraceEnabled()) {
            log.trace("analyze input=[{}], pos=[{}]", input, pos);
        }

        List<AnalysisOutput> candidates = new ArrayList<AnalysisOutput>();
        boolean isVerbOnly = MorphUtil.hasVerbOnly(input);

        analysisByRule(input, candidates);
        if (!isVerbOnly || candidates.size() == 0) {
            addSingleWord(input, candidates);
        }
        Collections.sort(candidates, new AnalysisOutputComparator());

        // Decide whether compound nouns should be decomposed, and decompose them.
        // Scores may change in the process, so the candidates are re-sorted afterwards.
        if (decomposeCompounds(candidates)) {
            Collections.sort(candidates, new AnalysisOutputComparator());
        }

        List<AnalysisOutput> results = selectResults(candidates);
        if (results.size() == 0) {
            AnalysisOutput output = new AnalysisOutput(
                    input, null, null, PatternConstants.PTN_N, AnalysisOutput.SCORE_ANALYSIS);
            output.setPos(PatternConstants.POS_NOUN);
            results.add(output);
        }
        return results;
    }

    /**
     * Runs compound-noun decomposition over the sorted candidates.
     *
     * @return {@code true} if at least one examined candidate ended up with a score of
     *         {@code SCORE_COMPOUNDS} or higher, meaning the list should be re-sorted
     */
    private boolean decomposeCompounds(List<AnalysisOutput> candidates) throws MorphException {
        boolean changed = false;
        boolean correct = false;

        for (AnalysisOutput o : candidates) {
            if (o.getScore() == AnalysisOutput.SCORE_CORRECT) {
                if (o.getPatn() != PatternConstants.PTN_NJ) {
                    correct = true;
                }
                // Prevents "활성화해", already analyzed successfully as
                // [활성화(N),하(t),어야(e)], from being decomposed into [활성/화해].
                if (o.getPatn() == PatternConstants.PTN_NSM) {
                    break;
                }
                continue;
            }

            if (o.getPatn() < PatternConstants.PTN_VM && o.getStem().length() > 2) {
                // A bare noun is not decomposed once a correct non-NJ analysis has been seen.
                if (!(correct && o.getPatn() == PatternConstants.PTN_N)) {
                    confirmCNoun(o);
                }
                if (o.getScore() >= AnalysisOutput.SCORE_COMPOUNDS) {
                    changed = true;
                }
            }
        }
        return changed;
    }

    /**
     * Picks the final results out of the sorted candidates.
     *
     * @return the selected outputs; may be empty
     */
    private List<AnalysisOutput> selectResults(List<AnalysisOutput> candidates) throws MorphException {
        List<AnalysisOutput> results = new ArrayList<AnalysisOutput>();
        HashMap<String, AnalysisOutput> stems = new HashMap<String, AnalysisOutput>();

        boolean hasCorrect = false;      // a correct non-noun analysis has been accepted
        boolean hasCorrectNoun = false;  // a correct noun analysis has been accepted
        double ratio = 0;                // best NounUtil.countFoundNouns value seen so far
        AnalysisOutput compound = null;  // best not-yet-correct compound noun candidate

        for (AnalysisOutput o : candidates) {
            // The analysis itself succeeded but a constraint check failed.
            if (o.getScore() == AnalysisOutput.SCORE_FAIL) {
                continue;
            }

            if (o.getScore() == AnalysisOutput.SCORE_CORRECT && o.getPos() != PatternConstants.POS_NOUN) {
                addResults(o, results, stems);
                hasCorrect = true;
            } else if (o.getPos() == PatternConstants.POS_NOUN && o.getScore() == AnalysisOutput.SCORE_CORRECT) {
                // A decomposed compound noun is dropped once a correct non-noun analysis exists.
                if (hasCorrect && o.getCNounList().size() > 0) {
                    continue;
                }
                addResults(o, results, stems);
                hasCorrectNoun = true;
            } else if (o.getPos() == PatternConstants.POS_NOUN && o.getCNounList().size() > 0
                    && !hasCorrect && !hasCorrectNoun) {
                double curatio = NounUtil.countFoundNouns(o);
                if (ratio < curatio && (compound == null || compound.getJosa() == null)) {
                    ratio = curatio;
                    compound = o;
                }
            } else if (o.getPos() == PatternConstants.POS_NOUN
                    && !hasCorrect && !hasCorrectNoun && compound == null) {
                addResults(o, results, stems);
            } else if (o.getPatn() == PatternConstants.PTN_NSM) {
                addResults(o, results, stems);
            }
        }

        if (compound != null) {
            addResults(compound, results, stems);
        }
        return results;
    }

    /**
     * Generates candidates by trying every stem/ending split of {@code input}, from the
     * shortest ending to the longest. A kind of analysis (josa or eomi) stops being tried
     * once the syllable feature table marks the first syllable of the current ending with
     * {@code '0'} at {@code IDX_JOSA2} / {@code IDX_EOMI2}.
     */
    private void analysisByRule(String input, List<AnalysisOutput> candidates) throws MorphException {
        boolean josaFlag = true;
        boolean eomiFlag = true;
        int strlen = input.length();

        // First try the whole word as a stem with an empty ending.
        analysisWithEomi(input, "", candidates);

        for (int i = strlen - 1; i > 0; i--) {
            String stem = input.substring(0, i);
            String eomi = input.substring(i);
            char[] feature = SyllableUtil.getFeature(eomi.charAt(0));

            if (josaFlag && feature[SyllableUtil.IDX_JOSA1] == '1') {
                analysisWithJosa(stem, eomi, candidates);
            }
            if (eomiFlag) {
                analysisWithEomi(stem, eomi, candidates);
            }

            if (josaFlag && feature[SyllableUtil.IDX_JOSA2] == '0') {
                josaFlag = false;
            }
            if (eomiFlag && feature[SyllableUtil.IDX_EOMI2] == '0') {
                eomiFlag = false;
            }
            if (!josaFlag && !eomiFlag) {
                break;
            }
        }
    }

    /**
     * Adds {@code o} to {@code results}, keeping at most one output per (stem, pos) pair as
     * tracked by {@code stems}. If an output with the same stem and pos is already present,
     * it is replaced only when {@code o} has a greater pattern value.
     */
    private void addResults(AnalysisOutput o, List<AnalysisOutput> results, HashMap<String, AnalysisOutput> stems) {
        AnalysisOutput old = stems.get(o.getStem());
        if (old == null || old.getPos() != o.getPos()) {
            results.add(o);
            stems.put(o.getStem(), o);
        } else if (old.getPatn() < o.getPatn()) {
            results.remove(old);
            results.add(o);
            stems.put(o.getStem(), o);
        }
    }

    /**
     * Adds a candidate that treats the whole word as a single morpheme, placed at the front
     * of {@code candidates}.
     *
     * <ul>
     *   <li>Dictionary word that is an adverb and not a noun: added as {@code PTN_AID} with
     *       score {@code SCORE_CORRECT}.</li>
     *   <li>Dictionary word with noun feature {@code '1'}: added as a noun with score
     *       {@code SCORE_CORRECT}.</li>
     *   <li>Dictionary word with noun feature {@code '2'}: added as a noun with its
     *       constructor-default score.</li>
     *   <li>Word not in the dictionary: added as a noun with score {@code SCORE_ANALYSIS},
     *       but only if there are no candidates yet or
     *       {@code NounUtil.endsWith2Josa(word)} is false.</li>
     * </ul>
     */
    private void addSingleWord(String word, List<AnalysisOutput> candidates) throws MorphException {
        AnalysisOutput output = new AnalysisOutput(word, null, null, PatternConstants.PTN_N);
        output.setPos(PatternConstants.POS_NOUN);

        WordEntry entry = DictionaryUtil.getWord(word);
        if (entry != null) {
            if (entry.getFeature(WordEntry.IDX_NOUN) != '1'
                    && entry.getFeature(WordEntry.IDX_BUSA) == '1') {
                AnalysisOutput busa = new AnalysisOutput(word, null, null, PatternConstants.PTN_AID);
                busa.setPos(PatternConstants.POS_ETC);
                busa.setScore(AnalysisOutput.SCORE_CORRECT);
                candidates.add(0, busa);
            } else if (entry.getFeature(WordEntry.IDX_NOUN) == '1') {
                output.setScore(AnalysisOutput.SCORE_CORRECT);
                candidates.add(0, output);
            } else if (entry.getFeature(WordEntry.IDX_NOUN) == '2') {
                candidates.add(0, output);
            }
        } else if (candidates.size() == 0 || !NounUtil.endsWith2Josa(word)) {
            output.setScore(AnalysisOutput.SCORE_ANALYSIS);
            candidates.add(0, output);
        }
    }

    /**
     * Tries to analyze {@code stem + end} as a noun followed by a josa (particle).
     *
     * <p>Patterns listed by the original author:
     * <ul>
     *   <li>substantive + josa (PTN_NJ)</li>
     *   <li>substantive + verbalizing suffix + '음/기' + josa (PTN_NSMJ)</li>
     *   <li>predicate + '음/기' + josa (PTN_VMJ)</li>
     *   <li>predicate + '아/어' + auxiliary predicate + '음/기' + josa (PTN_VMXMJ)</li>
     * </ul>
     * This method itself builds the PTN_NJ candidate (or PTN_ADVJ for an adverb-only
     * dictionary entry) and calls {@code NounUtil.analysisMJ}, which presumably covers the
     * '음/기' patterns (not visible here; verify).
     *
     * @param stem       candidate stem; {@code null} or empty is ignored
     * @param end        candidate josa
     * @param candidates list that receives any candidates produced
     * @throws MorphException if cloning the output fails or a callee reports an error
     */
    public void analysisWithJosa(String stem, String end, List<AnalysisOutput> candidates) throws MorphException {
        if (log.isTraceEnabled()) {
            log.trace("조사를 분석합니다. stem=[{}], end=[{}]", stem, end);
        }
        if (stem == null || stem.length() == 0) {
            return;
        }

        char[] chrs = MorphUtil.decompose(stem.charAt(stem.length() - 1));

        // Give up unless `end` is a josa that can attach to this stem. The decomposed length
        // of the stem's last syllable (3 or 2 elements) selects which constraint applies.
        if (!DictionaryUtil.existJosa(end)
                || (chrs.length == 3 && ConstraintUtil.isTwoJosa(end))
                || (chrs.length == 2 && ConstraintUtil.isThreeJosa(end))
                || "".equals(end)) {
            return;
        }

        AnalysisOutput output = new AnalysisOutput(stem, end, null, PatternConstants.PTN_NJ);
        output.setPos(PatternConstants.POS_NOUN);

        try {
            // Called for its effect on `candidates`; the returned flag is not needed here.
            NounUtil.analysisMJ(output.clone(), candidates);
        } catch (CloneNotSupportedException e) {
            throw new MorphException(e.getMessage(), e);
        }

        WordEntry entry = DictionaryUtil.getWordExceptVerb(stem);
        if (entry != null) {
            output.setScore(AnalysisOutput.SCORE_CORRECT);
            if (entry.getFeature(WordEntry.IDX_NOUN) == '0' && entry.getFeature(WordEntry.IDX_BUSA) == '1') {
                output.setPos(PatternConstants.POS_ETC);
                output.setPatn(PatternConstants.PTN_ADVJ);
            }
        } else if (MorphUtil.hasVerbOnly(stem)) {
            return;
        }

        candidates.add(output);
    }

    /**
     * Tries to analyze {@code stem + end} as a predicate followed by an eomi (ending).
     *
     * <p>Patterns listed by the original author:
     * <ol>
     *   <li>사랑받다 : substantive + verbalizing suffix + eomi (PTN_NSM)</li>
     *   <li>사랑받아보다 : substantive + verbalizing suffix + '아/어' + auxiliary predicate
     *       + eomi (PTN_NSMXM)</li>
     *   <li>학교에서이다 : substantive + '에서/부터/에서부터' + '이' + eomi (PTN_NJCM)</li>
     *   <li>돕다 : predicate + eomi (PTN_VM)</li>
     *   <li>도움이다 : predicate + '음/기' + '이' + eomi (PTN_VMCM)</li>
     *   <li>도와주다 : predicate + '아/어' + auxiliary predicate + eomi (PTN_VMXM)</li>
     * </ol>
     *
     * @param stem       candidate stem
     * @param end        candidate ending; may be empty
     * @param candidates list that receives any candidates produced
     * @throws MorphException if cloning the output fails or a callee reports an error
     */
    public void analysisWithEomi(String stem, String end, List<AnalysisOutput> candidates) throws MorphException {
        if (log.isTraceEnabled()) {
            log.trace("어미를 분석합니다. stem=[{}], end=[{}]", stem, end);
        }

        String[] morphs = EomiUtil.splitEomi(stem, end);
        if (morphs[0] == null) {
            return; // the ending is not registered in the dictionary
        }

        String[] pomis = EomiUtil.splitPomi(morphs[0]);
        AnalysisOutput o = new AnalysisOutput(pomis[0], null, morphs[1], PatternConstants.PTN_VM);
        o.setPomi(pomis[1]);

        try {
            WordEntry entry = DictionaryUtil.getVerb(o.getStem());
            if (entry != null
                    && !("을".equals(end) && entry.getFeature(WordEntry.IDX_REGURA) == IrregularUtil.IRR_TYPE_LIUL)) {
                AnalysisOutput output = o.clone();
                output.setScore(AnalysisOutput.SCORE_CORRECT);
                MorphUtil.buildPtnVM(output, candidates);

                // Continue past this point only for the possible ㄹ-irregular case.
                char[] features = SyllableUtil.getFeature(stem.charAt(stem.length() - 1));
                if (features[SyllableUtil.IDX_YNPLN] == '0' || morphs[1].charAt(0) != 'ㄴ') {
                    return;
                }
            }

            String[] irrs = IrregularUtil.restoreIrregularVerb(
                    o.getStem(), o.getPomi() == null ? o.getEomi() : o.getPomi());
            if (irrs != null) {
                // Irregular verb: use the restored stem and the restored eomi/pomi.
                // The restored stem is accepted without a further dictionary lookup here.
                AnalysisOutput output = o.clone();
                output.setStem(irrs[0]);
                if (output.getPomi() == null) {
                    output.setEomi(irrs[1]);
                } else {
                    output.setPomi(irrs[1]);
                }
                output.setScore(AnalysisOutput.SCORE_CORRECT);
                MorphUtil.buildPtnVM(output, candidates);
            }

            if (VerbUtil.ananlysisNSM(o.clone(), candidates)) {
                return;
            }
            if (VerbUtil.ananlysisNSMXM(o.clone(), candidates)) {
                return;
            }
            // [substantive + '에서/에서부터' + '이' + eomi]
            if (VerbUtil.ananlysisNJCM(o.clone(), candidates)) {
                return;
            }
            if (VerbUtil.analysisVMCM(o.clone(), candidates)) {
                return;
            }
            VerbUtil.analysisVMXM(o.clone(), candidates);
        } catch (CloneNotSupportedException e) {
            throw new MorphException(e.getMessage(), e);
        }
    }

    /**
     * Attempts compound-noun decomposition on the noun candidates that come before the first
     * noun candidate scored {@code SCORE_CORRECT}; noun candidates after it are left alone.
     *
     * @param candidates candidates to examine, in iteration order
     * @throws MorphException if decomposition fails
     */
    public void analysisCNoun(List<AnalysisOutput> candidates) throws MorphException {
        boolean success = false;
        for (AnalysisOutput o : candidates) {
            if (o.getPos() != PatternConstants.POS_NOUN) {
                continue;
            }
            if (o.getScore() == AnalysisOutput.SCORE_CORRECT) {
                success = true;
            } else if (!success) {
                confirmCNoun(o);
            }
        }
    }

    /**
     * Checks whether the stem of {@code o} is a compound noun and, if so, records its unit
     * nouns on {@code o} and updates its score.
     *
     * <p>A stem counts as a compound noun when it is registered as one in the dictionary, or
     * when the compound analyzer splits it into more than one unit and every unit exists in
     * the dictionary. Stems shorter than three characters are never examined.
     *
     * @param o the output to examine; its compound list and score may be modified
     * @return {@code true} if the stem was confirmed as a valid compound noun
     * @throws MorphException if a dictionary or analyzer call fails
     */
    public boolean confirmCNoun(AnalysisOutput o) throws MorphException {
        if (o.getStem().length() < 3) {
            return false;
        }

        // Compound nouns registered in the dictionary carry noun feature '2'.
        WordEntry cnoun = DictionaryUtil.getCNoun(o.getStem());
        if (cnoun != null && cnoun.getFeature(WordEntry.IDX_NOUN) == '2') {
            o.addCNoun(cnoun.getCompounds());
            o.setScore(AnalysisOutput.SCORE_CORRECT);
            return true;
        }

        List<CompoundEntry> results = cnAnalyzer.analyze(o.getStem());

        boolean success = false;
        if (results.size() > 1) {
            o.setCNoun(results);
            success = true;
            for (CompoundEntry entry : results) {
                if (!entry.isExist()) {
                    success = false;
                }
            }
            o.setScore(AnalysisOutput.SCORE_COMPOUNDS);
        }

        if (success) {
            if (constraint(o)) {
                o.setScore(AnalysisOutput.SCORE_CORRECT);
            } else {
                o.setScore(AnalysisOutput.SCORE_FAIL);
                return false;
            }
        } else {
            // NOTE(review): NounUtil.confirmDNoun presumably rewrites `o` so that a retry
            // can succeed; its behavior is not visible here.
            if (NounUtil.confirmDNoun(o) && o.getScore() != AnalysisOutput.SCORE_CORRECT) {
                confirmCNoun(o);
            }
            if (o.getScore() == AnalysisOutput.SCORE_CORRECT) {
                success = true;
            }
            if (o.getCNounList().size() > 0 && !constraint(o)) {
                o.setScore(AnalysisOutput.SCORE_FAIL);
            }
        }
        return success;
    }

    /**
     * Applies hand-written constraints to the compound decomposition recorded on {@code o}.
     *
     * @return {@code false} if the decomposition must be rejected, {@code true} otherwise
     *         (including when there are no unit nouns to check)
     */
    private boolean constraint(AnalysisOutput o) throws MorphException {
        List<CompoundEntry> cnouns = o.getCNounList();
        if (cnouns.size() == 0) {
            return true; // nothing to constrain
        }

        String lastWord = cnouns.get(cnouns.size() - 1).getWord();

        if ("화해".equals(lastWord)) {
            // The unit in front of "화해" must be allowed to combine with it. A lone "화해"
            // has no preceding unit, so there is nothing to check.
            if (cnouns.size() >= 2
                    && !ConstraintUtil.canHaheCompound(cnouns.get(cnouns.size() - 2).getWord())) {
                return false;
            }
        } else if (o.getPatn() == PatternConstants.PTN_NSM) {
            if ("내".equals(o.getVsfx()) && lastWord.length() != 1) {
                WordEntry entry = DictionaryUtil.getWord(lastWord);
                if (entry != null && entry.getFeature(WordEntry.IDX_NE) == '0') {
                    return false;
                }
            } else if ("하".equals(o.getVsfx()) && lastWord.length() == 1) {
                // In cases like 짝사랑하다, reject when the trailing unit is one character.
                return false;
            }
        }
        return true;
    }
}