package com.tistory.devyongsik.analyzer;
import java.io.IOException;
import java.io.StringReader;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.junit.Before;
import org.junit.Test;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory;
import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil;
import com.tistory.devyongsik.analyzer.util.TestToken;
/**
 * Verifies that {@link KoreanStopFilter} drops the configured stop words
 * ("the" and ".") from a token stream while all other tokens survive with
 * their original offsets.
 */
public class KoreanStopFilterTest extends AnalyzerTestUtil {
    private List<TestToken> tokens = null;

    // The stop words present in this input are "the" and "."
    private StringReader reader = new StringReader("the 개발하고 꼭 이것을 잘 해야합니다. 공백입니다.");
    private DictionaryFactory dictionaryFactory = null;

    /**
     * Builds the expected token list: every token of the input except the
     * stop words, each with its start/end character offsets in the input.
     */
    @Before
    public void setUp() {
        tokens = Lists.newArrayList();
        dictionaryFactory = DictionaryFactory.getFactory();

        tokens.add(getToken("공백입니다", 24, 29));
        tokens.add(getToken("해야합니다", 17, 22));
        tokens.add(getToken("이것을", 11, 14));
        tokens.add(getToken("개발하고", 4, 8));
        tokens.add(getToken("꼭", 9, 10));
        tokens.add(getToken("잘", 15, 16));
    }

    /**
     * Installs a stop-word dictionary containing "the" and ".", runs the
     * tokenizer + stop filter over the input, and checks that exactly the
     * expected tokens remain.
     *
     * @throws IOException if the token stream fails during consumption
     */
    @Test
    public void stopFilter() throws IOException {
        Map<String, String> stopWordDictionaryMap = Maps.newHashMap();
        stopWordDictionaryMap.put("the", null);
        stopWordDictionaryMap.put(".", null);
        dictionaryFactory.setStopWordDictionaryMap(stopWordDictionaryMap);

        TokenStream stream = new KoreanStopFilter(new KoreanCharacterTokenizer(reader));
        try {
            stream.reset();
            List<TestToken> extractedTokens = collectExtractedNouns(stream);
            verify(tokens, extractedTokens);
        } finally {
            // Close in finally so the stream is released even when
            // reset/extraction/verify throws (the original leaked it then).
            stream.close();
        }
    }
}