/*
 * Copyright 2011-2013 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.kr.utils;

import org.apache.lucene.analysis.kr.morph.AnalysisOutput;
import org.apache.lucene.analysis.kr.morph.MorphException;
import org.apache.lucene.analysis.kr.morph.PatternConstants;
import org.apache.lucene.analysis.kr.morph.WordEntry;

import java.util.List;

/**
 * Static helpers for Hangul syllable (de)composition and for building
 * morphological analysis candidates.
 */
public class MorphUtil {

    private static final char[] CHOSEONG = {
        'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ',
        'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
    };

    private static final char[] JUNGSEONG = {
        'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ',
        'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ'
    };

    private static final char[] JONGSEONG = {
        '\0', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ',
        'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ',
        'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ'
    };

    /** Number of (medial, final) combinations that share one initial consonant. */
    private static final int JUNG_JONG = JUNGSEONG.length * JONGSEONG.length;

    /** First code point of the precomposed Hangul syllable range handled here. */
    private static final int HANGUL_BASE = 0xAC00;

    /** Last code point of the precomposed Hangul syllable range handled here. */
    private static final int HANGUL_LAST = 0xD7A3;

    /** Returns true when {@code c} lies inside the precomposed Hangul syllable range. */
    private static boolean isHangulSyllable(char c) {
        return c >= HANGUL_BASE && c <= HANGUL_LAST;
    }

    /**
     * Decomposes a single Hangul character into an array of its
     * initial/medial/final jamo.
     *
     * @param c the character to decompose
     * @return {@code {initial, medial, final}} when the syllable has a final
     *         consonant, {@code {initial, medial}} when it has none, or a
     *         one-element array holding {@code c} itself when {@code c} is
     *         outside the precomposed Hangul syllable range
     */
    public static char[] decompose(char c) {
        if (!isHangulSyllable(c)) {
            return new char[] { c };
        }

        int offset = c - HANGUL_BASE;
        int withinInitial = offset % JUNG_JONG;

        char choseong = CHOSEONG[offset / JUNG_JONG];
        char jungseong = JUNGSEONG[withinInitial / JONGSEONG.length];
        char jongseong = JONGSEONG[withinInitial % JONGSEONG.length];

        // Index 0 of JONGSEONG is the '\0' placeholder meaning "no final consonant".
        if (jongseong != 0) {
            return new char[] { choseong, jungseong, jongseong };
        }
        return new char[] { choseong, jungseong };
    }

    /**
     * Composes a syllable from jamo indexes.
     *
     * @param first  index into the initial-consonant table
     * @param middle index into the medial-vowel table
     * @param last   index into the final-consonant table (0 = none)
     * @return the composed character; indexes are not range-checked
     */
    public static char compound(int first, int middle, int last) {
        return (char) (HANGUL_BASE + first * JUNG_JONG + middle * JONGSEONG.length + last);
    }

    /**
     * Builds a syllable that keeps the initial consonant of {@code ch} and uses
     * the given medial and final indexes.
     *
     * @param ch   source syllable; expected to be a precomposed Hangul syllable,
     *             otherwise the result is not meaningful (the offset arithmetic
     *             wraps around in {@code char})
     * @param mdl  index into the medial-vowel table
     * @param last index into the final-consonant table (0 = none)
     * @return the composed character
     */
    public static char makeChar(char ch, int mdl, int last) {
        ch -= HANGUL_BASE;
        int first = ch / JUNG_JONG;
        return compound(first, mdl, last);
    }

    /**
     * Builds a syllable that keeps the initial consonant and medial vowel of
     * {@code ch} and uses the given final index.
     *
     * @param ch   source syllable; expected to be a precomposed Hangul syllable,
     *             otherwise the result is not meaningful (the offset arithmetic
     *             wraps around in {@code char})
     * @param last index into the final-consonant table (0 = none)
     * @return the composed character
     */
    public static char makeChar(char ch, int last) {
        ch -= HANGUL_BASE;
        int first = ch / JUNG_JONG;
        int middle = (ch % JUNG_JONG) / JONGSEONG.length;
        return compound(first, middle, last);
    }

    /**
     * Replaces the final consonant of {@code dest} with the final consonant of
     * {@code source}.
     *
     * @param dest   syllable whose initial and medial are kept
     * @param source syllable whose final-consonant index is taken; expected to
     *               be a precomposed Hangul syllable
     * @return the recomposed character
     */
    public static char replaceJongsung(char dest, char source) {
        source -= HANGUL_BASE;
        int last = source % JONGSEONG.length;
        return makeChar(dest, last);
    }

    /**
     * Builds the string used to print a morpheme type.
     *
     * @param word the morpheme text
     * @param type the type code
     * @return {@code word} followed by {@code type} in parentheses
     */
    public static String buildTypeString(String word, char type) {
        return word + "(" + type + ")";
    }

    /**
     * Marks {@code output} with the {@code PTN_VM} pattern and the verb POS,
     * then adds it to {@code candidates} either when its score is already
     * {@code SCORE_CORRECT} or when
     * {@code IrregularUtil.restoreIrregularVerb} returns a non-null result for
     * its stem. In the latter case the score is set to {@code SCORE_CORRECT}
     * and the stem is replaced by the first restored element.
     *
     * @param output     the analysis result to classify; modified in place
     * @param candidates list that receives {@code output} when accepted
     * @throws MorphException declared for the irregular-verb lookup
     */
    public static void buildPtnVM(AnalysisOutput output, List<AnalysisOutput> candidates) throws MorphException {
        // The pomi, when present, takes precedence over the eomi.
        String end = output.getEomi();
        if (output.getPomi() != null) {
            end = output.getPomi();
        }

        output.setPatn(PatternConstants.PTN_VM);
        output.setPos(PatternConstants.POS_VERB);

        if (output.getScore() == AnalysisOutput.SCORE_CORRECT) {
            candidates.add(output);
            return;
        }

        String[] irrs = IrregularUtil.restoreIrregularVerb(output.getStem(), end);
        if (irrs != null) {
            output.setScore(AnalysisOutput.SCORE_CORRECT);
            output.setStem(irrs[0]);
            candidates.add(output);
        }
    }

    /**
     * Verb/adjective + '음/기' + '이' + ending, noun + '에서/부터/에서부터' + '이' + ending.
     *
     * <p>Looks at the second-to-last character of the stem and delegates to the
     * VMCM builder when that character is '기' or carries a final consonant.
     * A stem that is {@code null} or shorter than two characters has no such
     * character and yields no candidate.
     *
     * @param output     the analysis result to classify; may be modified in place
     * @param candidates list that receives {@code output} when accepted
     * @throws org.apache.lucene.analysis.kr.morph.MorphException
     *         declared for the dictionary / irregular-verb lookups
     */
    public static void buildPtnCM(AnalysisOutput output, List<AnalysisOutput> candidates) throws MorphException {
        String stem = output.getStem();
        if (stem == null || stem.length() < 2) {
            // Previously this case failed with StringIndexOutOfBoundsException / NPE.
            return;
        }

        char ch = stem.charAt(stem.length() - 2);
        char[] jasos = decompose(ch);

        if (jasos.length == 3 || ch == '기') {
            buildPtnVMCM(output, candidates);
        }
    }

    /**
     * Marks {@code output} with the {@code PTN_VMCM} pattern and the verb POS,
     * strips the trailing nominalizer from the stem, and adds the output to
     * {@code candidates} when the remaining stem is found as a verb (directly
     * or through irregular-verb restoration).
     *
     * <p>The caller must pass an output whose stem has at least two characters.
     */
    private static void buildPtnVMCM(AnalysisOutput output, List<AnalysisOutput> candidates) throws MorphException {
        String stem = output.getStem();

        output.setPatn(PatternConstants.PTN_VMCM);
        output.setPos(PatternConstants.POS_VERB);

        char ch = stem.charAt(stem.length() - 2);
        char[] jasos = decompose(ch);
        String head = stem.substring(0, stem.length() - 2);

        if (ch == '기') {
            output.addElist("기");
            output.addElist("이");
            output.setStem(head);
            if (DictionaryUtil.getVerb(output.getStem()) != null) {
                candidates.add(output);
            }
        } else if (jasos.length == 3 && jasos[2] == 'ㅁ') {
            // Drop the final consonant from the second-to-last syllable.
            stem = head + makeChar(ch, 0);
            output.addElist("ㅁ");
            output.addElist("이");
            output.setStem(stem);

            if (DictionaryUtil.getVerb(stem) != null) {
                candidates.add(output);
            } else {
                String[] morphs = IrregularUtil.restoreIrregularVerb(stem, "ㅁ");
                if (morphs != null) {
                    output.setScore(AnalysisOutput.SCORE_CORRECT);
                    output.setStem(morphs[0]);
                    candidates.add(output);
                }
            }
        }
    }

    /**
     * Reports whether any character of {@code input} has the
     * {@code SyllableUtil.IDX_WDSURF} feature flag set to {@code '1'}.
     *
     * @param input the text to scan (scanned from the last character backwards)
     * @return {@code true} as soon as one such character is found,
     *         {@code false} otherwise (including for an empty string)
     * @throws MorphException declared for the syllable-feature lookup
     */
    public static boolean hasVerbOnly(String input) throws MorphException {
        for (int i = input.length() - 1; i >= 0; i--) {
            char[] feature = SyllableUtil.getFeature(input.charAt(i));
            if (feature[SyllableUtil.IDX_WDSURF] == '1') {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds and returns the tense pre-final ending.
     *
     * <p>When {@code endword} starts with one of the jamo 'ㅆ', 'ㄴ', 'ㄹ', 'ㅁ'
     * or 'ㅂ', that jamo is merged into the last syllable of {@code preword} as
     * its final consonant and the rest of {@code endword} is appended. In every
     * other case, including when the last character of {@code preword} is not
     * a precomposed Hangul syllable, the two strings are simply concatenated.
     *
     * @param preword '아' or '어'
     * @param endword the ending [including the pre-final ending]
     * @return the combined form, e.g. '았' or '었' followed by the rest of the
     *         ending; {@code endword} when {@code preword} is null or empty;
     *         {@code preword} when {@code endword} is null or empty
     */
    public static String makeTesnseEomi(String preword, String endword) {
        if (preword == null || preword.length() == 0) {
            return endword;
        }
        if (endword == null || endword.length() == 0) {
            return preword;
        }

        int jongseongIndex = mergeableJongseongIndex(endword.charAt(0));
        int lastPos = preword.length() - 1;
        char lastSyllable = preword.charAt(lastPos);

        // Merging into a non-syllable would wrap around in char arithmetic and
        // yield an unrelated character, so fall back to concatenation.
        if (jongseongIndex < 0 || !isHangulSyllable(lastSyllable)) {
            return preword + endword;
        }

        return preword.substring(0, lastPos)
            + makeChar(lastSyllable, jongseongIndex)
            + endword.substring(1);
    }

    /**
     * Maps a leading jamo of an ending to its index in the final-consonant
     * table, or returns -1 when the jamo is not one that is merged.
     */
    private static int mergeableJongseongIndex(char jamo) {
        switch (jamo) {
            case 'ㅆ':
                return 20;
            case 'ㄴ':
                return 4;
            case 'ㄹ':
                return 8;
            case 'ㅁ':
                return 16;
            case 'ㅂ':
                return 17;
            default:
                return -1;
        }
    }

    /**
     * Checks whether a verbalizing suffix can be attached; in particular,
     * whether 되다/하다-type entries registered in the dictionary can be
     * attached.
     *
     * <p>The current implementation performs no check and always returns
     * {@code true}.
     *
     * @param entry the dictionary entry (currently unused)
     * @param o     the analysis result (currently unused)
     * @return always {@code true}
     */
    public static boolean isValidSuffix(WordEntry entry, AnalysisOutput o) {
        return true;
    }
}