/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.kr.tagging;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.kr.morph.AnalysisOutput;
import org.apache.lucene.analysis.kr.morph.MorphException;
import org.apache.lucene.analysis.kr.morph.PatternConstants;
import org.apache.lucene.analysis.kr.utils.ConstraintUtil;
import org.apache.lucene.analysis.kr.utils.FileUtil;
import org.apache.lucene.analysis.kr.utils.KoreanEnv;
import org.apache.lucene.analysis.kr.utils.StringUtil;
import org.apache.lucene.analysis.kr.utils.Trie;
/**
* 여러개의 형태소분석 결과 중에 최적의 것을 선택한다.
* 이 함수는 문장단위로 호출되어야 한다.
* @author user
*
*/
public class Tagger {
private static Trie<String, String[]> occurrences;
private static final String tagDicLoc = "tagger.dic";
private static final String NILL = "NILL";
private static final String NOPATN = "0";
private AnalysisOutput po;
public AnalysisOutput tagging(String psource, List<AnalysisOutput> pmorphs) throws MorphException {
return tagging(psource, null, pmorphs, null);
}
public AnalysisOutput tagging(String psource, String rsource, List<AnalysisOutput> pmorphs, List<AnalysisOutput> rmorphs) throws MorphException {
if((pmorphs==null||pmorphs.size()==0)&&(rmorphs==null||rmorphs.size()==0)) return null;
po = lookupBest(psource, rsource, pmorphs, rmorphs);
po.setSource(psource);
return po;
}
/**
* po가 NULL이 아닌 경우만 호출된다.
* occurrence.dic 에 등록되어 있는 경우만.. 최적을 찾아서 반환한다.
* 1. 첫번째는 어간으로 시작되는 문법 규칙을 찾는다.
* 2. 두번째는 표층형으로 시작되는 문법규칙을 찾는다.
* @param morphs
* @return
*/
private AnalysisOutput lookupBest(String psource,String rsource, List<AnalysisOutput> pmorphs, List<AnalysisOutput> rmorphs) throws MorphException {
if(pmorphs.size()==1) return pmorphs.get(0);
AnalysisOutput select = null;
if(rmorphs!=null&&rmorphs.size()!=0) select = lookupBestByRWord(psource, rsource, pmorphs, rmorphs);
if(select!=null) return select;
if(po!=null) select = lookupBestByPWord(psource, pmorphs);
if(select!=null) return select;
return pmorphs.get(0);
}
/**
* 앞 어절에 의해 현재 어절을 결정한다.
* 앞 어절은 NULL이 아니다.
* @param source
* @param pmorphs
* @param rmorphs
* @return
* @throws MorphException
*/
private AnalysisOutput lookupBestByPWord(String rsource, List<AnalysisOutput> rmorphs) throws MorphException {
List<AnalysisOutput> removes = new ArrayList();
for(AnalysisOutput morph : rmorphs) {
Iterator<String[]> iterw = getGR("F"+rsource+"^W");
AnalysisOutput best = selectBest(iterw, po.getSource(), rsource, po, morph, true, removes);
if(best!=null) return best;
Iterator<String[]> iters = getGR("F"+morph.getStem()+"^S");
best = selectBest(iters, po.getSource(), rsource, po, morph, true, removes);
if(best!=null) return best;
}
for(AnalysisOutput morph : removes) {
if(rmorphs.size()>1) rmorphs.remove(morph);
}
return null;
}
/**
* 뒷 어절에 의해 현재 어절이 결정된다.
* 뒷 어절은 NULL이 아니다.
* @param source
* @param pmorphs
* @param rmorphs
* @return
* @throws MorphException
*/
private AnalysisOutput lookupBestByRWord(String psource, String rsource, List<AnalysisOutput> pmorphs, List<AnalysisOutput> rmorphs) throws MorphException {
List<AnalysisOutput> removes = new ArrayList();
for(AnalysisOutput rmorph : rmorphs) {
if(rmorph.getScore()!=AnalysisOutput.SCORE_CORRECT) break;
String rend = rmorph.getJosa();
if(rend==null) rend = rmorph.getEomi();
for(AnalysisOutput pmorph : pmorphs) {
Iterator<String[]> iterw = getGR("R"+psource+"^W/");
String pend = pmorph.getJosa();
if(pend==null) pend = pmorph.getEomi();
AnalysisOutput best = selectBest(iterw, psource, rsource, pmorph, rmorph, false, removes);
if(best!=null) return best;
Iterator<String[]> iters = getGR("R"+NILL+"/"+pend+"/");
best = selectBest(iters, psource, rsource, pmorph, rmorph, false, removes);
if(best!=null) return best;
iters = getGR("R"+pmorph.getStem()+"^S/");
best = selectBest(iters, psource, rsource, pmorph, rmorph, false, removes);
if(best!=null) return best;
}
}
for(AnalysisOutput morph : removes) {
if(pmorphs.size()>1) pmorphs.remove(morph);
}
return null;
}
private AnalysisOutput selectBest(Iterator<String[]> iter, String psource, String rsource,
AnalysisOutput pmorph, AnalysisOutput rmorph, boolean rear, List removes) {
while(iter.hasNext()) {
String[] values = iter.next();
if(checkGrammer(values, psource, rsource, pmorph, rmorph, rear)) {
if(rear) return rmorph;
else return pmorph;
} else if("1".equals(values[6])) {
if(!removes.contains(pmorph)) removes.add(pmorph);
break;
}
}
return null;
}
private boolean checkGrammer(String[] values, String psource, String rsource, AnalysisOutput pmorph, AnalysisOutput rmorph, boolean depFront) {
boolean ok = true;
String pend = pmorph.getJosa();
if(pend==null) pend = pmorph.getEomi();
String rend = rmorph.getJosa();
if(rend==null) rend = rmorph.getEomi();
if(depFront&&!NILL.equals(values[0])&&!checkWord(psource,values[0],pmorph)) { // 앞 어절의 어휘
return false;
}
if(!NILL.equals(values[1])&& !checkEomi(values[1], pend)) { // 앞 어절의 어미
return false;
}
if(!NOPATN.equals(values[2])&&!checkPattern(values[2], pmorph.getPatn())) {// 앞 어절의 패턴
return false;
}
if(!depFront&&!NILL.equals(values[3])&&!checkWord(rsource,values[3],rmorph)) { // 뒷 어절의 어휘
return false;
}
if(!NILL.equals(values[4])&& !checkEomi(values[4], rend)) { // 뒷 어절의 어미
return false;
}
if(!NOPATN.equals(values[5]) && !checkPattern(values[5], rmorph.getPatn())) { // 뒷 어절의 패턴
return false;
}
return true;
}
private boolean checkWord(String source, String value, AnalysisOutput morph) {
String[] types = StringUtil.split(value,"^");
String[] strs = StringUtil.split(types[0],",");
String text = source;
if("S".equals(types[1])) text = morph.getStem();
for(int i=0;i<strs.length;i++) {
if(strs[i].equals(text)) return true;
}
return false;
}
private boolean checkEomi(String value, String rend) {
String[] strs = StringUtil.split(value,",");
for(int i=0;i<strs.length;i++) {
if(strs[i].equals(rend)) return true;
}
return false;
}
private boolean checkPattern(String value, int ptn) {
String[] strs = StringUtil.split(value,",");
String strPtn = Integer.toString(ptn);
for(int i=0;i<strs.length;i++) {
if("E".equals(strs[i])&&ConstraintUtil.isEomiPhrase(ptn))
return true;
else if("J".equals(strs[i])&&
(ConstraintUtil.isJosaNounPhrase(ptn)||ptn==PatternConstants.PTN_N))
return true;
else if(strs[i].equals(strPtn))
return true;
}
return false;
}
public static synchronized Iterator<String[]> getGR(String prefix) throws MorphException {
if(occurrences==null) loadTaggerDic();
return occurrences.getPrefixedBy(prefix);
}
private static synchronized void loadTaggerDic() throws MorphException {
occurrences = new Trie(true);
try {
List<String> strs = FileUtil.readLines(KoreanEnv.getInstance().getValue(tagDicLoc), "UTF-8");
for(String str : strs) {
if(str==null) continue;
str = str.trim();
String[] syls = StringUtil.split(str,":");
if(syls.length!=4) continue;
String key = null;
if("F".equals(syls[0])) key = syls[2].substring(0,syls[2].lastIndexOf("/")+1) + syls[1].substring(0,syls[1].lastIndexOf("/"));
else key = syls[1].substring(0,syls[1].lastIndexOf("/")+1) + syls[2].substring(0,syls[2].lastIndexOf("/"));
String[] patns = StringUtil.split(syls[1]+"/"+syls[2]+"/"+syls[3],"/");
occurrences.add(syls[0]+key, patns);
}
} catch (Exception e) {
throw new MorphException("Fail to read the tagger dictionary.("+tagDicLoc+")\n"+e.getMessage());
}
}
}