package com.tistory.devyongsik.analyzer;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.kr.morph.AnalysisOutput;
import org.apache.lucene.analysis.kr.morph.CompoundEntry;
import org.apache.lucene.analysis.kr.morph.MorphAnalyzer;
import org.apache.lucene.analysis.kr.morph.MorphException;
import org.apache.lucene.analysis.kr.morph.PatternConstants;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.State;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * {@link Engine} that feeds the current term to the Korean morphological
 * analyzer ({@link MorphAnalyzer}) and records every noun it reports as an
 * additional token state of type {@code morph_noun}.
 *
 * @author need4spd, need4spd@cplanet.co.kr, 2011. 10. 14.
 */
public class KoreanMorphEngine implements Engine {

    private static final Logger logger = LoggerFactory.getLogger(KoreanMorphEngine.class);

    /** Only tokens carrying this type are handed to the morphological analyzer. */
    private static final String TYPE_WORD = "word";

    /** Type assigned to every token this engine produces. */
    private static final String TYPE_MORPH_NOUN = "morph_noun";

    private final MorphAnalyzer morph;

    /** Creates the engine together with its own {@link MorphAnalyzer} instance. */
    public KoreanMorphEngine() {
        logger.info("init KoreanMorphEngine");
        morph = new MorphAnalyzer();
    }

    /**
     * Registers the current term in {@code returnedTokens} and appends one
     * {@link ComparableState} per noun extracted from it to
     * {@code comparableStateList}.
     *
     * <p>Tokens whose type is not {@code "word"} are ignored. A
     * {@link MorphException} raised by the analyzer is logged and swallowed, so
     * a failing term yields no extra tokens instead of aborting the caller.
     *
     * @param attributeSource     source holding the current token; it must already
     *                            contain term, type and offset attributes
     * @param comparableStateList receives the captured state of each extracted noun
     * @param returnedTokens      keys of the form {@code term_startOffset_endOffset}
     *                            for tokens that were already produced; the current
     *                            term is added here
     * @throws Exception declared by {@link Engine}
     */
    @Override
    public void collectNounState(AttributeSource attributeSource, List<ComparableState> comparableStateList, Map<String, String> returnedTokens)
            throws Exception {
        CharTermAttribute termAttr = attributeSource.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAttr = attributeSource.getAttribute(TypeAttribute.class);
        OffsetAttribute offsetAttr = attributeSource.getAttribute(OffsetAttribute.class);

        // Constant-first comparison: a null type is treated as "not a word" rather than throwing.
        if (!TYPE_WORD.equals(typeAttr.type())) {
            logger.debug("명사 분석 대상이 아닙니다.");
            return;
        }

        String term = termAttr.toString();
        returnedTokens.put(term + "_" + offsetAttr.startOffset() + "_" + offsetAttr.endOffset(), "");

        try {
            analysisKorean(attributeSource, comparableStateList, returnedTokens);
        } catch (MorphException e) {
            // Best effort: keep going without morph nouns, but keep the stack trace and the term.
            logger.error("morph analysis failed for term [" + term + "]", e);
        }
    }

    /**
     * Analyzes the current term and adds a {@code morph_noun} state for every
     * distinct noun that differs from the term itself and is not yet listed in
     * {@code returnedTokens}.
     *
     * <p>Each state is built by restoring the incoming token state and then
     * overwriting term, position increment (0), offsets and type.
     *
     * <p>NOTE(review): after the last emitted noun the attribute source is left
     * holding that noun's values; the incoming state is not restored at the end.
     * Confirm that callers pass a clone or do not read the source afterwards.
     *
     * @param attrSource          source holding the token to analyze
     * @param comparableStateList receives the captured states
     * @param returnedTokens      keys of tokens that must not be emitted again
     * @throws MorphException if the morphological analyzer fails
     */
    @SuppressWarnings("unchecked")
    private void analysisKorean(AttributeSource attrSource, List<ComparableState> comparableStateList, Map<String, String> returnedTokens) throws MorphException {
        logger.debug("analysisKorean");

        CharTermAttribute termAttr = attrSource.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttr = attrSource.getAttribute(OffsetAttribute.class);
        String input = termAttr.toString();
        logger.info("morph engine input : {}", input);

        // Each AnalysisOutput is one candidate reading of the term; the analyzer may return several.
        // e.g. (에서사랑하고,0,6,type=<KOREAN>) -> [에서사랑하(N),고(j)],[에서사랑(N),하고(j)],[에서사랑하고(N)]
        List<AnalysisOutput> outputs = morph.analyze(input);

        if (logger.isDebugEnabled()) {
            for (AnalysisOutput output : outputs) {
                logger.debug("outputs : [{}] : {}", output.getStem(), output.getScore());
                logger.debug("outputs all info : [{}] : {}", output, output.getPos());
            }
        }

        // Distinct nouns to emit. The value only marks the origin (1 = stem, 0 = compound part)
        // and is not read anywhere in this class.
        Map<String, Integer> candidates = new HashMap<String, Integer>();
        for (AnalysisOutput output : outputs) {
            // Only readings tagged as noun (N) are used.
            if (output.getPos() != PatternConstants.POS_NOUN) {
                continue;
            }
            String stem = output.getStem();
            // A null stem would fail in indexOf below and an empty one would yield an empty token.
            if (stem != null && stem.length() > 0) {
                candidates.put(stem, Integer.valueOf(1));
            }
            // Compound-noun parts are added only when the reading has the SCORE_CORRECT score.
            if (output.getScore() == AnalysisOutput.SCORE_CORRECT) {
                List<CompoundEntry> cnouns = output.getCNounList();
                for (CompoundEntry cnoun : cnouns) {
                    String word = cnoun.getWord();
                    // Single-character parts are dropped.
                    if (word != null && word.length() > 1) {
                        candidates.put(word, Integer.valueOf(0));
                    }
                }
            }
        }

        // Offsets of the incoming token, read once before the source is modified.
        int baseStart = offsetAttr.startOffset();
        int baseEnd = offsetAttr.endOffset();
        State original = attrSource.captureState();

        for (String noun : candidates.keySet()) {
            // The term itself was already registered by the caller; do not emit it twice.
            if (input.equals(noun)) {
                continue;
            }

            // A noun found inside the term gets its exact span; otherwise it inherits the term's span.
            int index = input.indexOf(noun);
            int startOffset;
            int endOffset;
            if (index >= 0) {
                startOffset = baseStart + index;
                endOffset = startOffset + noun.length();
            } else {
                startOffset = baseStart;
                endOffset = baseEnd;
            }

            // Start from the incoming token again, then overwrite the fields that differ.
            attrSource.restoreState(original);

            CharTermAttribute nounTerm = attrSource.addAttribute(CharTermAttribute.class);
            nounTerm.setEmpty();
            nounTerm.append(noun);

            PositionIncrementAttribute nounPosition = attrSource.addAttribute(PositionIncrementAttribute.class);
            nounPosition.setPositionIncrement(0);

            OffsetAttribute nounOffset = attrSource.addAttribute(OffsetAttribute.class);
            nounOffset.setOffset(startOffset, endOffset);

            TypeAttribute nounType = attrSource.addAttribute(TypeAttribute.class);
            nounType.setType(TYPE_MORPH_NOUN);

            String makeKeyForCheck = noun + "_" + startOffset + "_" + endOffset;
            if (returnedTokens.containsKey(makeKeyForCheck)) {
                logger.debug("[{}] 는 이미 추출된 Token입니다. Skip", makeKeyForCheck);
                continue;
            }

            // NOTE(review): the emitted key is not added to returnedTokens here, so code that
            // consults the map later will not see this token - confirm this is intended.
            ComparableState comparableState = new ComparableState();
            comparableState.setState(attrSource.captureState());
            comparableState.setStartOffset(startOffset);
            comparableStateList.add(comparableState);

            logger.debug("추출 된 명사 : [{}]", noun);
        }
    }
}