/*
* Copyright 2011-2013 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.kr.utils;
import org.apache.lucene.analysis.kr.morph.AnalysisOutput;
import org.apache.lucene.analysis.kr.morph.MorphException;
import org.apache.lucene.analysis.kr.morph.PatternConstants;
import org.apache.lucene.analysis.kr.morph.WordEntry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class VerbUtil {
private static final Logger log = LoggerFactory.getLogger(VerbUtil.class);
private static final boolean isTraceEnabled = log.isTraceEnabled();
private static final boolean isDebugEnabled = log.isDebugEnabled();
public static final Map<String, String> verbSuffix;
public static final Map<String, String> xVerb;
private static final String[] suffixs = { "이", "하", "되", "내", "나", "스럽", "시키", "있", "없", "같", "당하", "만하", "드리", "받", "짓" };
private static final String[] xverbs = { "오", "내", "주", "보", "지", "오르", "올리" };
static {
verbSuffix = new HashMap<String, String>();
for (String suffix : suffixs)
verbSuffix.put(suffix, suffix);
xVerb = new HashMap<String, String>();
for (String xverb : xverbs)
xVerb.put(xverb, xverb);
}
/** 어간이 용언화접미사로 끝나면 index 를 반환한다. 아니면 -1을 반환한다. */
public static int endsWithVerbSuffix(String stem) {
if (isTraceEnabled)
log.trace("용언화접미사의 index를 반환합니다. stem=[{}]", stem);
if (stem == null || stem.length() < 2) return -1;
int len = stem.length();
int start = 2;
if (len == 2) start = 1;
for (int i = start; i > 0; i--) { // suffix 의 가장 긴 글자수가 2이다.
if (verbSuffix.get(stem.substring(len - i)) != null) return (len - i);
}
return -1;
}
/** 어간부에 보조용언 [하,되,오,내,주,지]가 있는지 조사한다. */
public static int endsWithXVerb(String stem) {
if (isTraceEnabled)
log.trace("보조용언의 index를 반환합니다. stem=[{}]", stem);
if (stem == null || stem.length() < 2) return -1;
int len = stem.length();
int start = 2;
if (len == 2) start = 1;
for (int i = start; i > 0; i--) { //xverbs 의 가장 긴 글자수는 2이다.
if (xVerb.get(stem.substring(len - i)) != null)
return (len - i);
}
return -1;
}
public static boolean verbSuffix(String stem) {
return verbSuffix.get(stem) != null;
}
public static boolean constraintVerb(String start, String end) {
char[] schs = MorphUtil.decompose(start.charAt(start.length() - 1));
char[] echs = MorphUtil.decompose(end.charAt(0));
return !(schs.length == 3 && schs[2] == 'ㄹ' && echs[0] == 'ㄹ');
}
/** 3. 학교에서이다 : 체언 + '에서/부터/에서부터' + '이' + 어미 (PTN_NJCM) <br> */
public static boolean ananlysisNJCM(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException {
int strlen = o.getStem().length();
boolean success = false;
if (strlen > 3 && (o.getStem().endsWith("에서이") || o.getStem().endsWith("부터이"))) {
o.addElist(o.getStem().substring(strlen - 1));
o.setJosa(o.getStem().substring(strlen - 3, strlen - 1));
o.setStem(o.getStem().substring(0, strlen - 3));
success = true;
} else if (strlen > 5 && (o.getStem().endsWith("에서부터이"))) {
o.addElist(o.getStem().substring(strlen - 1));
o.setJosa(o.getStem().substring(strlen - 5, strlen - 1));
o.setStem(o.getStem().substring(0, strlen - 5));
success = true;
}
if (!success) return false;
if (DictionaryUtil.getNoun(o.getStem()) != null) {
o.setScore(AnalysisOutput.SCORE_CORRECT);
// }else {
// NounUtil.confirmCNoun(o);
}
o.setPatn(PatternConstants.PTN_NJCM);
o.setPos(PatternConstants.POS_NOUN);
candidates.add(o);
return true;
}
/**
* 어미부와 어간부가 분리된 상태에서 용언화접미사가 결합될 수 있는지 조사한다.
*
* @param o 어미부와 어간부가 분리된 결과
* @param candidates 결과
* @throws org.apache.lucene.analysis.kr.morph.MorphException
*
*/
public static boolean ananlysisNSM(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException {
if (o.getStem().endsWith("스러우")) o.setStem(o.getStem().substring(0, o.getStem().length() - 3) + "스럽");
int idxVbSfix = VerbUtil.endsWithVerbSuffix(o.getStem());
if (idxVbSfix < 1) return false;
o.setVsfx(o.getStem().substring(idxVbSfix));
o.setStem(o.getStem().substring(0, idxVbSfix));
o.setPatn(PatternConstants.PTN_NSM);
o.setPos(PatternConstants.POS_NOUN);
WordEntry entry = DictionaryUtil.getWordExceptVerb(o.getStem());
// if(entry==null&&NounUtil.confirmCNoun(o)&&o.getCNounList().size()>0) {
// entry = DictionaryUtil.getNoun(o.getCNounList().get(o.getCNounList().size()-1).getWord());
// }
// if(entry==null) return false;
// if(entry==null) {
// NounUtil.confirmDNoun(o);
// if(o.getScore()!=AnalysisOutput.SCORE_CORRECT) return false;
// }
if (entry != null) {
if (entry.getFeature(WordEntry.IDX_NOUN) == '0') return false;
else if (o.getVsfx().equals("하") && entry.getFeature(WordEntry.IDX_DOV) != '1') return false;
else if (o.getVsfx().equals("되") && entry.getFeature(WordEntry.IDX_BEV) != '1') return false;
else if (o.getVsfx().equals("내") && entry.getFeature(WordEntry.IDX_NE) != '1') return false;
o.setScore(AnalysisOutput.SCORE_CORRECT); // '입니다'인 경우 인명 등 미등록어가 많이 발생되므로 분석성공으로 가정한다.
} else {
o.setScore(AnalysisOutput.SCORE_ANALYSIS); // '입니다'인 경우 인명 등 미등록어가 많이 발생되므로 분석성공으로 가정한다.
}
candidates.add(o);
return true;
}
public static boolean ananlysisNSMXM(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException {
int idxXVerb = VerbUtil.endsWithXVerb(o.getStem());
if (idxXVerb == -1) return false;
String eogan = o.getStem().substring(0, idxXVerb);
String[] stomis = null;
if ((eogan.endsWith("아") || eogan.endsWith("어")) && eogan.length() > 1)
stomis = EomiUtil.splitEomi(eogan.substring(0, eogan.length() - 1), eogan.substring(eogan.length() - 1));
else
stomis = EomiUtil.splitEomi(eogan, "");
if (stomis[0] == null) return false;
o.addElist(stomis[1]);
int idxVbSfix = VerbUtil.endsWithVerbSuffix(stomis[0]);
if (idxVbSfix == -1) return false;
o.setXverb(o.getStem().substring(idxXVerb));
o.setVsfx(stomis[0].substring(idxVbSfix));
o.setStem(stomis[0].substring(0, idxVbSfix));
o.setPatn(PatternConstants.PTN_NSMXM);
o.setPos(PatternConstants.POS_NOUN);
WordEntry entry = DictionaryUtil.getNoun(o.getStem());
// if(entry==null&&NounUtil.confirmCNoun(o)&&o.getCNounList().size()>0) {
// entry = DictionaryUtil.getNoun(o.getCNounList().get(o.getCNounList().size()-1));
// }
if (entry == null) return false;
if (o.getVsfx().equals("하") && entry.getFeature(WordEntry.IDX_DOV) != '1') return false;
if (o.getVsfx().equals("되") && entry.getFeature(WordEntry.IDX_BEV) != '1') return false;
o.setScore(AnalysisOutput.SCORE_CORRECT);
candidates.add(o);
return true;
}
public static boolean analysisVMCM(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException {
int strlen = o.getStem().length();
if (strlen < 2) return false;
if (!o.getStem().endsWith("이")) return false;
char[] chrs = MorphUtil.decompose(o.getStem().charAt(strlen - 2));
boolean success = false;
if (strlen > 2 && o.getStem().endsWith("기이")) {
o.setStem(o.getStem().substring(0, strlen - 2));
o.addElist("기");
success = true;
} else if (chrs.length > 2 && chrs[2] == 'ㅁ') {
String[] eres = EomiUtil.splitEomi(o.getStem().substring(0, strlen - 1), "");
if (eres[0] == null) return false;
o.addElist(eres[1]);
String[] irrs = IrregularUtil.restoreIrregularVerb(eres[0], eres[1]);
if (irrs != null) o.setStem(irrs[0]);
else o.setStem(eres[0]);
success = true;
}
if (success) {
o.addElist("이");
if (DictionaryUtil.getVerb(o.getStem()) != null) {
o.setPos(PatternConstants.POS_VERB);
o.setPatn(PatternConstants.PTN_VMCM);
o.setScore(AnalysisOutput.SCORE_CORRECT);
candidates.add(o);
return true;
}
}
return false;
}
/**
* 6. 도와주다 : 용언 + '아/어' + 보조용언 + 어미 (PTN_VMXM)
*
* @throws org.apache.lucene.analysis.kr.morph.MorphException
*
*/
public static boolean analysisVMXM(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException {
int idxXVerb = VerbUtil.endsWithXVerb(o.getStem());
if (idxXVerb == -1) return false;
o.setXverb(o.getStem().substring(idxXVerb));
String eogan = o.getStem().substring(0, idxXVerb);
String[] stomis = null;
if (eogan.endsWith("아") || eogan.endsWith("어")) {
stomis = EomiUtil.splitEomi(eogan.substring(0, eogan.length() - 1), eogan.substring(eogan.length() - 1));
if (stomis[0] == null) return false;
} else {
stomis = EomiUtil.splitEomi(eogan, "");
if (stomis[0] == null || !(stomis[1].startsWith("아") || stomis[1].startsWith("어"))) return false;
}
String[] irrs = IrregularUtil.restoreIrregularVerb(stomis[0], stomis[1]);
if (irrs != null) {
o.setStem(irrs[0]);
o.addElist(irrs[1]);
} else {
o.setStem(stomis[0]);
o.addElist(stomis[1]);
}
if (DictionaryUtil.getVerb(o.getStem()) != null) {
o.setPos(PatternConstants.POS_VERB);
o.setPatn(PatternConstants.PTN_VMXM);
o.setScore(AnalysisOutput.SCORE_CORRECT);
candidates.add(o);
return true;
}
return false;
}
}