package com.tistory.devyongsik.analyzer;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.State;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory;
import com.tistory.devyongsik.analyzer.dictionaryindex.SynonymDictionaryIndex;

/**
 * {@link Engine} that emits synonym tokens for the term currently held by an
 * {@link AttributeSource}.
 *
 * <p>When the class is loaded, the synonym list obtained from
 * {@link DictionaryFactory} is handed to {@link SynonymDictionaryIndex} for indexing.
 * Each lookup then runs a {@link TermQuery} on the {@code "syn"} field of that index
 * and reports every other {@code "syn"} value stored on the matching documents.
 */
public class KoreanSynonymEngine implements Engine {

	private static final Logger logger = LoggerFactory.getLogger(KoreanSynonymEngine.class);

	/** Maximum number of index hits collected for a single synonym lookup. */
	private static final int MAX_SYNONYM_HITS = 5 * 5;

	static {
		DictionaryFactory dictionaryFactory = DictionaryFactory.getFactory();
		createSynonymIndex(dictionaryFactory.getSynonymList());
	}

	/**
	 * Passes the given synonym entries to the shared {@link SynonymDictionaryIndex}
	 * for indexing.
	 *
	 * @param synonyms synonym dictionary entries to index
	 */
	private static void createSynonymIndex(List<String> synonyms) {
		SynonymDictionaryIndex indexingModule = SynonymDictionaryIndex.getIndexingModule();
		indexingModule.indexingDictionary(synonyms);
	}

	/**
	 * Looks up the synonyms of {@code word} in the synonym index.
	 *
	 * @param word term to search for in the {@code "syn"} field
	 * @return every {@code "syn"} value of the matching documents except {@code word}
	 *         itself; empty when nothing matches
	 * @throws Exception if refreshing, searching or reading the index fails
	 */
	private List<String> getWords(String word) throws Exception {
		List<String> synWordList = new ArrayList<String>();

		if (logger.isDebugEnabled()) {
			logger.debug("동의어 탐색 : " + word);
		}

		Query query = new TermQuery(new Term("syn", word));

		if (logger.isDebugEnabled()) {
			logger.debug("query : " + query);
		}

		SynonymDictionaryIndex indexingModule = SynonymDictionaryIndex.getIndexingModule();
		SearcherManager searcherManager = indexingModule.getSearcherManager();
		searcherManager.maybeRefresh();

		IndexSearcher indexSearcher = searcherManager.acquire();
		try {
			TopScoreDocCollector collector = TopScoreDocCollector.create(MAX_SYNONYM_HITS, false);
			indexSearcher.search(query, collector);
			ScoreDoc[] hits = collector.topDocs().scoreDocs;

			if (logger.isDebugEnabled()) {
				logger.debug("대상 word : " + word);
				// Original author's note: only a single document is expected to match.
				logger.debug("동의어 갯수 : " + hits.length);
			}

			for (ScoreDoc hit : hits) {
				Document doc = indexSearcher.doc(hit.doc);
				String[] values = doc.getValues("syn");

				for (String value : values) {
					if (logger.isDebugEnabled()) {
						logger.debug("대상 word : " + "[" + word + "]" + " 추출된 동의어 : " + value);
					}

					// The looked-up word is stored alongside its synonyms; do not report it back.
					if (!word.equals(value)) {
						synWordList.add(value);
					}
				}
			}
		} finally {
			// Always hand the searcher back, even when the search or document load throws;
			// otherwise the acquired reference is never released.
			searcherManager.release(indexSearcher);
		}

		return synWordList;
	}

	/**
	 * Registers the current term in {@code returnedTokens} and appends one captured
	 * state per not-yet-seen synonym of that term to {@code comparableStateList}.
	 *
	 * <p>The attributes of {@code attributeSource} are modified in place while the
	 * synonym states are captured: term text, position increment and type are
	 * overwritten and are not restored afterwards.
	 *
	 * @param attributeSource     source holding the current token's attributes
	 * @param comparableStateList receives one {@link ComparableState} per emitted synonym
	 * @param returnedTokens      tokens emitted so far, keyed by
	 *                            {@code term_startOffset_endOffset}; updated by this call
	 * @throws Exception if the synonym lookup fails
	 */
	@Override
	public void collectNounState(AttributeSource attributeSource, List<ComparableState> comparableStateList, Map<String, String> returnedTokens) throws Exception {
		CharTermAttribute charTermAttr = attributeSource.getAttribute(CharTermAttribute.class);
		OffsetAttribute offSetAttr = attributeSource.getAttribute(OffsetAttribute.class);

		returnedTokens.put(charTermAttr.toString() + "_" + offSetAttr.startOffset() + "_" + offSetAttr.endOffset(), "");

		if (logger.isDebugEnabled()) {
			logger.debug("넘어온 Term : " + charTermAttr.toString());
		}

		List<String> synonyms = getWords(charTermAttr.toString());

		if (synonyms.isEmpty()) {
			// No synonyms for this term: nothing to emit.
			return;
		}

		for (String synonymWord : synonyms) {
			String makeKeyForCheck = synonymWord + "_" + offSetAttr.startOffset() + "_" + offSetAttr.endOffset();

			if (returnedTokens.containsKey(makeKeyForCheck)) {
				if (logger.isDebugEnabled()) {
					logger.debug("[" + makeKeyForCheck + "] 는 이미 추출된 Token입니다. Skip");
				}
				continue;
			}
			returnedTokens.put(makeKeyForCheck, "");

			// #1. A synonym differs from the original token only in its term text, type and
			// position increment, so only those attributes are fetched and changed.
			// The offsets are the same as the original's and are left untouched.
			CharTermAttribute attr = attributeSource.getAttribute(CharTermAttribute.class);
			attr.setEmpty();
			attr.append(synonymWord);

			PositionIncrementAttribute positionAttr = attributeSource.getAttribute(PositionIncrementAttribute.class);
			positionAttr.setPositionIncrement(0); // a synonym occupies the same position as the original

			TypeAttribute typeAtt = attributeSource.getAttribute(TypeAttribute.class);
			// Mark the token as a synonym so that later noun extraction can skip it.
			typeAtt.setType("synonym");

			ComparableState comparableState = new ComparableState();
			comparableState.setState(attributeSource.captureState());
			comparableState.setStartOffset(offSetAttr.startOffset());

			comparableStateList.add(comparableState);
		}
	}
}