package com.tistory.devyongsik.analyzer;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil;
import com.tistory.devyongsik.analyzer.util.TestToken;
/**
*
* @author 장용석, 2011.07.16 need4spd@naver.com
*/
public class KoreanCharacterTokenizerTest extends AnalyzerTestUtil {

    /** Expected tokens (term text + start/end offsets) for the sample input. */
    private Set<TestToken> tokenizedToken = new HashSet<TestToken>();

    /** Sample input mixing Hangul, Latin letters, digits, '.' and a space. */
    private StringReader content = new StringReader("삼성전자absc1234엠피3mp3버전1.2 띄어쓰기");

    /** Tokenizer under test, reading the sample input above. */
    private KoreanCharacterTokenizer tokenizer = new KoreanCharacterTokenizer(content);

    /**
     * Builds the expected token set and resets the tokenizer before each test.
     * The tokenizer is expected to split the input at every character-class
     * boundary (Hangul / Latin / digit); '.' and whitespace are not emitted.
     */
    @Before
    public void setUp() throws IOException {
        tokenizedToken.add(getToken("띄어쓰기", 25, 29));
        tokenizedToken.add(getToken("2", 22, 23));
        tokenizedToken.add(getToken("1", 20, 21));
        tokenizedToken.add(getToken("버전", 18, 20));
        tokenizedToken.add(getToken("3", 17, 18));
        tokenizedToken.add(getToken("mp", 15, 17));
        tokenizedToken.add(getToken("3", 14, 15));
        tokenizedToken.add(getToken("엠피", 12, 14));
        tokenizedToken.add(getToken("1234", 8, 12));
        tokenizedToken.add(getToken("absc", 4, 8));
        tokenizedToken.add(getToken("삼성전자", 0, 4));
        tokenizer.reset();
    }

    /**
     * Consumes the whole token stream and verifies the emitted tokens match
     * the expected set exactly.
     */
    @Test
    public void testIncrementToken() throws IOException {
        CharTermAttribute charTermAtt = tokenizer.getAttribute(CharTermAttribute.class);
        OffsetAttribute offSetAtt = tokenizer.getAttribute(OffsetAttribute.class);

        Set<TestToken> actualTokens = new HashSet<TestToken>();
        while (tokenizer.incrementToken()) {
            actualTokens.add(getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset()));
        }

        // Assert full set equality: the previous contains()-only check passed
        // even when the tokenizer emitted no tokens at all (or dropped some),
        // because missing tokens were never detected.
        Assert.assertEquals(tokenizedToken, actualTokens);
    }
}