package org.apache.lucene.analysis.kr.utils;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.kr.morph.AnalysisOutput;
import org.apache.lucene.analysis.kr.morph.MorphException;
import org.apache.lucene.analysis.kr.morph.PatternConstants;
import org.apache.lucene.analysis.kr.morph.WordEntry;
public class MorphUtil {
private static final char[] CHOSEONG = {
'ㄱ','ㄲ','ㄴ','ㄷ','ㄸ','ㄹ','ㅁ','ㅂ','ㅃ','ㅅ',
'ㅆ','ㅇ','ㅈ','ㅉ','ㅊ','ㅋ','ㅌ','ㅍ','ㅎ'
};
private static final char[] JUNGSEONG = {
'ㅏ','ㅐ','ㅑ','ㅒ','ㅓ','ㅔ','ㅕ','ㅖ','ㅗ','ㅘ',
'ㅙ','ㅚ','ㅛ','ㅜ','ㅝ','ㅞ','ㅟ','ㅠ','ㅡ','ㅢ',
'ㅣ'
};
private static final char[] JONGSEONG = {
'\0','ㄱ','ㄲ','ㄳ','ㄴ','ㄵ','ㄶ','ㄷ','ㄹ','ㄺ',
'ㄻ','ㄼ','ㄽ','ㄾ','ㄿ','ㅀ','ㅁ','ㅂ','ㅄ','ㅅ',
'ㅆ','ㅇ','ㅈ','ㅊ','ㅋ','ㅌ','ㅍ','ㅎ'
};
private static final int JUNG_JONG = JUNGSEONG.length * JONGSEONG.length;
/**
* 한글 한글자를 초성/중성/종성의 배열로 만들어 반환한다.
* @param c
* @return
*/
public static char[] decompose(char c) {
char[] result = null;
if(c>0xD7A3||c<0xAC00) return new char[]{c};
c -= 0xAC00;
char choseong = CHOSEONG[c/JUNG_JONG];
c = (char)(c % JUNG_JONG);
char jungseong = JUNGSEONG[c/JONGSEONG.length];
char jongseong = JONGSEONG[c%JONGSEONG.length];
if(jongseong != 0) {
result = new char[] {choseong, jungseong, jongseong};
}else {
result = new char[] {choseong, jungseong};
}
return result;
}
public static char compound(int first, int middle, int last) {
return (char)(0xAC00 + first* JUNG_JONG + middle * JONGSEONG.length + last);
}
public static char makeChar(char ch, int mdl, int last) {
ch -= 0xAC00;
int first = ch/JUNG_JONG;
return compound(first,mdl,last);
}
public static char makeChar(char ch, int last) {
ch -= 0xAC00;
int first = ch/JUNG_JONG;
ch = (char)(ch % JUNG_JONG);
int middle = ch/JONGSEONG.length;
return compound(first,middle,last);
}
public static char replaceJongsung(char dest, char source) {
source -= 0xAC00;
int last = source % JONGSEONG.length;
return makeChar(dest,last);
}
/**
* 형태소 유형 출력을 위한 문자열을 생성한다.
* @param word
* @param type
* @return
*/
public static String buildTypeString(String word, char type) {
StringBuffer sb = new StringBuffer();
sb.append(word);
sb.append("(");
sb.append(type);
sb.append(")");
return sb.toString();
}
public static void buildPtnVM(AnalysisOutput output, List candidates) throws MorphException {
String end = output.getEomi();
if(output.getPomi()!=null) end = output.getPomi();
output.setPatn(PatternConstants.PTN_VM);
output.setPos(PatternConstants.POS_VERB);
if(output.getScore()==AnalysisOutput.SCORE_CORRECT) {
candidates.add(output);
}else {
String[] irrs = IrregularUtil.restoreIrregularVerb(output.getStem(),end);
if(irrs!=null) {
output.setScore(AnalysisOutput.SCORE_CORRECT);
output.setStem(irrs[0]);
candidates.add(output);
}
}
}
/**
* 용언 + '음/기' + '이' + 어미, 체언 + '에서/부터/에서부터' + '이' + 어미
* @param output
* @param candidates
* @throws MorphException
*/
public static void buildPtnCM(AnalysisOutput output, List candidates) throws MorphException {
char ch = output.getStem().charAt(output.getStem().length()-2);
char[] jasos = MorphUtil.decompose(ch);
if(jasos.length==3||ch=='기') {
buildPtnVMCM(output,candidates);
} else {
}
}
private static void buildPtnVMCM(AnalysisOutput output, List candidates) throws MorphException {
String stem = output.getStem();
output.setPatn(PatternConstants.PTN_VMCM);
output.setPos(PatternConstants.POS_VERB);
char ch = stem.charAt(stem.length()-2);
char[] jasos = MorphUtil.decompose(ch);
if(ch=='기') {
output.addElist("기");
output.addElist("이");
output.setStem(stem.substring(0,stem.length()-2));
if(DictionaryUtil.getVerb(output.getStem())!=null)
candidates.add(output);
}else if(jasos[2]=='ㅁ') {
if(stem.length()>1) stem = stem.substring(0,stem.length()-2);
stem += MorphUtil.makeChar(ch, 0);
output.addElist("ㅁ");
output.addElist("이");
output.setStem(stem);
if(DictionaryUtil.getVerb(stem)!=null)
candidates.add(output);
else {
String[] morphs = IrregularUtil.restoreIrregularVerb(stem,"ㅁ");
if(morphs!=null) {
output.setScore(AnalysisOutput.SCORE_CORRECT);
output.setStem(morphs[0]);
candidates.add(output);
}
}
}
}
public static boolean hasVerbOnly(String input) throws MorphException {
for(int i=input.length()-1;i>=0;i--) {
char[] feature = SyllableUtil.getFeature(input.charAt(i));
if(feature[SyllableUtil.IDX_WDSURF]=='1'&&input.length()>i) return true;
}
return false;
}
/**
* 시제 선어미말을 만들어서 반환한다.
* @param preword '아' 또는 '어'
* @param endword 어미[선어미말을 포함]
* @return '았' 또는 '었'을 만들어서 반환한다.
*/
public static String makeTesnseEomi(String preword, String endword) {
if(preword==null||preword.length()==0) return endword;
if(endword==null||endword.length()==0) return preword;
if(endword.charAt(0)=='ㅆ') {
return preword.substring(0,preword.length()-1)+
makeChar(preword.charAt(preword.length()-1),20)+endword.substring(1,endword.length());
} else if(endword.charAt(0)=='ㄴ') {
return preword.substring(0,preword.length()-1)+
makeChar(preword.charAt(preword.length()-1),4)+endword.substring(1,endword.length());
} else if(endword.charAt(0)=='ㄹ') {
return preword.substring(0,preword.length()-1)+
makeChar(preword.charAt(preword.length()-1),8)+endword.substring(1,endword.length());
} else if(endword.charAt(0)=='ㅁ') {
return preword.substring(0,preword.length()-1)+
makeChar(preword.charAt(preword.length()-1),16)+endword.substring(1,endword.length());
} else if(endword.charAt(0)=='ㅂ') {
return preword.substring(0,preword.length()-1)+
makeChar(preword.charAt(preword.length()-1),17)+endword.substring(1,endword.length());
}
return preword+endword;
}
/**
* 용언화접미사가 결합될 수 있는지 여부를 점검한다.
* 특히 사전에 등록된 되다, 하다형 의 접속이 가능한지를 조사한다.
* @param o
* @return
*/
public static boolean isValidSuffix(WordEntry entry, AnalysisOutput o) {
return true;
}
}