package org.fastcatsearch.ir.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.AnalyzerOption;
import org.apache.lucene.analysis.tokenattributes.CharsRefTermAttribute;
import org.junit.Test;

import java.io.IOException;
import java.io.StringReader;

/**
 * Exercises NGramWordTokenizer and NGramWordAnalyzer against Korean input
 * and prints the emitted n-gram terms for manual inspection.
 */
public class NGramAnalyzerTest {

    @Test
    public void testTokenizer() throws IOException {
        int minGram = 2;
        int maxGram = 5;
        // Alternative English input, kept for reference:
        // String str = "An Epic Tale of a Moose And a Girl who must Confront a Monkey in Ancient India";
        // Korean: "free door-to-door collection program for large waste home appliances"
        String str = "대형폐가전제품무상방문 수거사업";
        // Commented-out input from a user report. Korean, roughly: "I explained that this case
        // seems to happen because the term is not being split, and that registering '가전'
        // (home appliance) in the user dictionary would make it searchable; the user replied
        // that they cannot register every unrecognized word each time a search fails."
        // str = "사례에 대해서는 단언가 쪼개지지 않아서 그런 것 같다 라고 말씀 드리고,\n"
        //         + "사용자 사전에 “가전”을 등록하면 검색이 가능해질거라고 말씀 드렸더니 \n"
        //         + "매번 검색이 안될때마다 인식 하지 못한 단어를 등록할 수는 없다.. 라고 하시네요.";

        StringReader input = new StringReader(str);
        NGramWordTokenizer tokenizer = new NGramWordTokenizer(input, minGram, maxGram);
        tokenizer.reset();
        CharsRefTermAttribute charTermAttribute = tokenizer.getAttribute(CharsRefTermAttribute.class);
        int i = 1;
        while (tokenizer.incrementToken()) {
            System.out.println(i++ + ">" + charTermAttribute.toString() + "<");
        }
    }

    @Test
    public void testAnalyzerForDocument() throws IOException, InterruptedException {
        // Korean: "free door-to-door collection program for large waste home appliances"
        String str = "대형폐가전제품무상방문 수거사업";
        NGramWordAnalyzer analyzer = new NGramWordAnalyzer();

        StringReader reader = new StringReader(str);
        TokenStream tokenStream = analyzer.tokenStream("1", reader);
        // reset() must be called before incrementToken(), as in the other tests.
        tokenStream.reset();
        CharsRefTermAttribute charTermAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class);
        int i = 1;
        while (tokenStream.incrementToken()) {
            System.out.println(i++ + ">" + charTermAttribute.toString() + "<");
        }

        System.out.println("==============================================");

        // Run the same input through the analyzer a second time to check
        // that the token stream can be obtained again and re-consumed.
        StringReader reader2 = new StringReader(str);
        TokenStream tokenStream2 = analyzer.tokenStream("2", reader2);
        tokenStream2.reset();
        CharsRefTermAttribute charTermAttribute2 = tokenStream2.getAttribute(CharsRefTermAttribute.class);
        int i2 = 1;
        while (tokenStream2.incrementToken()) {
            System.out.println(i2++ + ">>>" + charTermAttribute2.toString() + "<");
        }
    }

    @Test
    public void testAnalyzerForQuery() throws IOException, InterruptedException {
        // setForQuery() switches the analyzer into query-time behavior.
        AnalyzerOption analyzerOption = new AnalyzerOption();
        analyzerOption.setForQuery();

        // Korean: "large / waste home appliance / free door-to-door product / collection program"
        String str = "대형 폐가전 제품무상방문 수거사업";
        NGramWordAnalyzer analyzer = new NGramWordAnalyzer();

        StringReader reader = new StringReader(str);
        TokenStream tokenStream = analyzer.tokenStream("1", reader, analyzerOption);
        tokenStream.reset();
        CharsRefTermAttribute charTermAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class);
        int i = 1;
        while (tokenStream.incrementToken()) {
            System.out.println(i++ + ">" + charTermAttribute.toString() + "<");
        }

        System.out.println("==============================================");

        StringReader reader2 = new StringReader(str);
        TokenStream tokenStream2 = analyzer.tokenStream("2", reader2, analyzerOption);
        tokenStream2.reset();
        CharsRefTermAttribute charTermAttribute2 = tokenStream2.getAttribute(CharsRefTermAttribute.class);
        int i2 = 1;
        while (tokenStream2.incrementToken()) {
            System.out.println(i2++ + ">>>" + charTermAttribute2.toString() + "<");
        }
    }

    @Test
    public void testTokenizerWithEdge() throws IOException {
        int minGram = 1;
        int maxGram = 3;
        boolean includeEdge = true; // constructor flag: also emit edge n-grams
        // Korean: "large waste home appliances"
        String str = "대형폐가전제품";

        StringReader input = new StringReader(str);
        NGramWordTokenizer tokenizer = new NGramWordTokenizer(input, minGram, maxGram, includeEdge);
        tokenizer.reset();
        CharsRefTermAttribute charTermAttribute = tokenizer.getAttribute(CharsRefTermAttribute.class);
        int i = 1;
        while (tokenizer.incrementToken()) {
            System.out.println(i++ + ">" + charTermAttribute.toString() + "<");
        }
    }
}