package org.fastcatsearch.ir.dictionary;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.AnalyzerOption;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharsRefTermAttribute;
import org.fastcatsearch.ir.analysis.AnalyzerPool;
import org.fastcatsearch.ir.io.CharVector;
import org.fastcatsearch.util.WordCombination;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Static helpers for turning keywords into analyzed terms and for matching
 * word combinations against dictionary maps and sets.
 *
 * Created by swsong on 2015. 7. 30..
 */
public class DictionaryUtils {

    private static final Logger logger = LoggerFactory.getLogger(DictionaryUtils.class);

    private static final AnalyzerOption noOption = new AnalyzerOption();

    /**
     * Analyzes {@code keyword} with an analyzer borrowed from {@code analyzerPool}
     * and collects the produced terms in order.
     *
     * @param analyzerPool pool the analyzer is borrowed from and released back to
     * @param keyword      text to analyze
     * @return the analyzed terms, or {@code null} if an {@link IOException} occurred
     *         while reading the token stream (the error is logged)
     */
    public static List<String> makeTermList(AnalyzerPool analyzerPool, String keyword) {
        Analyzer analyzer = null;
        TokenStream tokenStream = null;
        List<String> termList = new ArrayList<String>();
        try {
            analyzer = analyzerPool.getFromPool();
            tokenStream = analyzer.tokenStream("", new StringReader(keyword), noOption);
            tokenStream.reset();

            // Prefer the CharsRef-based attribute when the stream provides it;
            // otherwise fall back to the plain CharTermAttribute.
            if (tokenStream.hasAttribute(CharsRefTermAttribute.class)) {
                CharsRefTermAttribute termAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class);
                while (tokenStream.incrementToken()) {
                    termList.add(termAttribute.charsRef().toString());
                }
            } else {
                CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
                while (tokenStream.incrementToken()) {
                    termList.add(termAttribute.toString());
                }
            }
        } catch (IOException e) {
            logger.error("", e);
            return null;
        } finally {
            if (tokenStream != null) {
                try {
                    tokenStream.close();
                } catch (IOException ignore) {
                    // Closing is best-effort; the terms have already been collected.
                }
            }
            // Only hand back what was actually borrowed: if getFromPool() failed,
            // analyzer is still null and must not be pushed into the pool.
            if (analyzer != null) {
                analyzerPool.releaseToPool(analyzer);
            }
        }
        return termList;
    }

    /**
     * Looks up every word of {@code wordList} in the dictionary map and pairs each
     * hit with the first value stored for it.
     *
     * @param map      dictionary map, keyed by word
     * @param wordList word entries to look up; may be {@code null}
     * @return one pair per entry whose word has a non-empty value array in
     *         {@code map}; empty (never {@code null}) when nothing matches
     */
    public static List<WordCombination.WordEntryPair> findExtendedKey(Map<CharVector, CharVector[]> map, List<WordCombination.WordEntry> wordList) {
        List<WordCombination.WordEntryPair> foundPairList = new ArrayList<WordCombination.WordEntryPair>();
        if (wordList == null) {
            return foundPairList;
        }
        for (WordCombination.WordEntry wordEntry : wordList) {
            CharVector[] value = map.get(new CharVector(wordEntry.getWord()));
            if (value != null && value.length > 0) {
                // Only the first value of the dictionary entry is used.
                foundPairList.add(new WordCombination.WordEntryPair(wordEntry, value[0].toString()));
            }
        }
        return foundPairList;
    }

    /**
     * Keeps the entries of {@code wordList} whose word is contained in the
     * dictionary set.
     *
     * @param set      dictionary set of words
     * @param wordList word entries to look up; may be {@code null}
     * @return the entries found in {@code set}, in their original order; empty
     *         (never {@code null}) when nothing matches
     */
    public static List<WordCombination.WordEntry> findExtendedKey(Set<CharVector> set, List<WordCombination.WordEntry> wordList) {
        List<WordCombination.WordEntry> foundList = new ArrayList<WordCombination.WordEntry>();
        if (wordList == null) {
            return foundList;
        }
        for (WordCombination.WordEntry wordEntry : wordList) {
            if (set.contains(new CharVector(wordEntry.getWord()))) {
                foundList.add(wordEntry);
            }
        }
        return foundList;
    }

    /**
     * Collects the terms that are left over once the words found in the
     * dictionary are excluded: a term is dropped when it equals any element of
     * any entry in {@code pairList}.
     *
     * @param termList analyzed terms
     * @param pairList dictionary hits whose entry elements are excluded
     * @return {@code termList} itself when {@code pairList} is {@code null} or
     *         empty, otherwise a new list holding the remaining terms in order
     */
    public static List<String> makeRemnantList(List<String> termList, List<WordCombination.WordEntryPair> pairList) {
        if (pairList == null || pairList.isEmpty()) {
            return termList;
        }
        Set<String> matched = new HashSet<String>();
        for (WordCombination.WordEntryPair pair : pairList) {
            collectElements(pair.getEntry(), matched);
        }
        return excludeTerms(termList, matched);
    }

    /**
     * Same as {@link #makeRemnantList(List, List)} but takes the word entries
     * directly instead of entry/value pairs.
     *
     * @param termList analyzed terms
     * @param list     entries whose elements are excluded
     * @return {@code termList} itself when {@code list} is {@code null} or empty,
     *         otherwise a new list holding the remaining terms in order
     */
    public static List<String> makeRemnantList2(List<String> termList, List<WordCombination.WordEntry> list) {
        if (list == null || list.isEmpty()) {
            return termList;
        }
        Set<String> matched = new HashSet<String>();
        for (WordCombination.WordEntry entry : list) {
            collectElements(entry, matched);
        }
        return excludeTerms(termList, matched);
    }

    /**
     * Joins the words with {@code delimiter} between every pair of neighbours.
     *
     * @param wordList  words to join
     * @param delimiter separator placed between consecutive words
     * @return the joined string, or {@code null} when {@code wordList} is
     *         {@code null} or empty
     */
    public static String joinWordList(List<String> wordList, String delimiter) {
        if (wordList == null || wordList.isEmpty()) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        // Track position explicitly: testing sb.length() would drop the delimiter
        // after leading empty strings.
        boolean first = true;
        for (String e : wordList) {
            if (!first) {
                sb.append(delimiter);
            }
            sb.append(e);
            first = false;
        }
        return sb.toString();
    }

    /** Adds every non-null sub-element of {@code entry} to {@code target}. */
    private static void collectElements(WordCombination.WordEntry entry, Set<String> target) {
        for (String e : entry.getElements()) {
            if (e != null) {
                target.add(e);
            }
        }
    }

    /** Returns a new list with the terms of {@code termList} that are not in {@code excluded}. */
    private static List<String> excludeTerms(List<String> termList, Set<String> excluded) {
        List<String> remnant = new ArrayList<String>();
        for (String word : termList) {
            if (!excluded.contains(word)) {
                remnant.add(word);
            }
        }
        return remnant;
    }
}