package com.tistory.devyongsik.analyzer; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Stack; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource.State; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; public class KoreanCompoundNounEngine implements Engine { private Logger logger = LoggerFactory.getLogger(KoreanCompoundNounEngine.class); private Map<String, List<String>> compoundNouns = new HashMap<String, List<String>>(); public KoreanCompoundNounEngine() { if(logger.isInfoEnabled()) { logger.info("init KoreanCompoundNounEngine"); } compoundNouns = DictionaryFactory.getFactory().getCompoundDictionaryMap(); } @Override public void collectNounState(AttributeSource attributeSource, Stack<State> nounsStack, Map<String, String> returnedTokens) throws Exception { CharTermAttribute termAttr = attributeSource.getAttribute(CharTermAttribute.class); TypeAttribute typeAttr = attributeSource.getAttribute(TypeAttribute.class); OffsetAttribute offSetAttr = attributeSource.getAttribute(OffsetAttribute.class); PositionIncrementAttribute positionAttr = attributeSource.getAttribute(PositionIncrementAttribute.class); String termString = termAttr.toString(); returnedTokens.put(termString+"_"+offSetAttr.startOffset()+"_"+offSetAttr.endOffset(), ""); //복합명사 사전에 있는 단어면 List<String> matchedData = compoundNouns.get(termString); if(matchedData != null) { typeAttr.setType("compounds"); for(String noun : matchedData) { if(logger.isDebugEnabled()) { logger.debug("복합명사추출 : " + noun); } int startOffSet = termString.indexOf(noun); int endOffSet = startOffSet + noun.length(); String makeKeyForCheck = noun + "_" + startOffSet + "_" + endOffSet; if(returnedTokens.containsKey(makeKeyForCheck)) { if(logger.isDebugEnabled()) { logger.debug("["+makeKeyForCheck+"] 는 이미 추출된 Token입니다. Skip"); } continue; } else { returnedTokens.put(makeKeyForCheck, ""); } termAttr.setEmpty(); termAttr.append(noun); positionAttr.setPositionIncrement(1); offSetAttr.setOffset(startOffSet , endOffSet); typeAttr.setType("compound"); nounsStack.add(attributeSource.captureState()); } } return; } }