/*
 * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 * -------------------
 * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
 * http://www.manning.com/ingersoll
 */

package com.tamingtext.texttamer.solr;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import junit.framework.TestCase;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.junit.Test;

import com.tamingtext.TamingTextTestJ4;

/**
 * Exercises {@link SentenceTokenizer} as created by {@link SentenceTokenizerFactory}:
 * each sentence of the input is expected to come back as a single token with
 * a position increment of 1 and offsets that map back onto the input string.
 */
public class SentenceTokenizerTest extends TamingTextTestJ4 {

  /**
   * Tokenizes a four-sentence input twice with the same tokenizer instance
   * (the second pass checks that the tokenizer can be reused after a reset)
   * and verifies the text, position increment and offsets of every token, as
   * well as the total number of tokens produced in each pass.
   *
   * @throws IOException if the tokenizer fails while reading its input
   */
  @Test
  public void tokenizerTest() throws IOException {
    String inputString = "A man, a plan, a canal, Panama! "
        + "No matter where you go, there you are. "
        + "You are in a maze of twisty, little, passages. "
        + "Use the force Luke. ";

    String[] expectedStrings = {
        "A man, a plan, a canal, Panama!",
        "No matter where you go, there you are.",
        "You are in a maze of twisty, little, passages.",
        "Use the force Luke."
    };

    File modelsDir = getModelDir();
    SentenceTokenizerFactory factory = new SentenceTokenizerFactory();
    Map<String, String> args = new HashMap<String, String>();
    args.put("modelDirectory", modelsDir.getAbsolutePath());
    factory.init(args);

    SentenceTokenizer tok = factory.create(new StringReader(inputString));

    // Looked up once rather than per token; the original re-fetched (and
    // redundantly cast) them inside the loop.
    // NOTE(review): assumes the tokenizer registers its attributes at
    // construction time, as the Lucene TokenStream contract requires.
    CharTermAttribute cta = tok.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute pta = tok.getAttribute(PositionIncrementAttribute.class);
    OffsetAttribute oa = tok.getAttribute(OffsetAttribute.class);

    for (int pass = 0; pass < 2; pass++) { // second pass tests reuse
      int pos = 0;
      int offset = 0;

      while (tok.incrementToken()) {
        System.err.println("'" + cta.toString() + "'");
        System.err.println(pta.toString());
        System.err.println(oa.toString());
        System.err.println("--- pass: " + pass);

        // Fail with a readable message instead of an
        // ArrayIndexOutOfBoundsException when too many tokens are produced.
        TestCase.assertTrue("Tokenizer produced more sentences than expected (pass " + pass + ")",
            pos < expectedStrings.length);

        String expected = expectedStrings[pos];
        TestCase.assertEquals("Strings don't match", expected, cta.toString());
        TestCase.assertEquals("Position increment is incorrect", 1, pta.getPositionIncrement());
        TestCase.assertEquals("Start offset is incorrect", offset, oa.startOffset());
        TestCase.assertEquals("End offset is incorrect", offset + expected.length(), oa.endOffset());

        offset += expected.length() + 1; // space after end of sentence
        pos++;
      }

      // Without this check a tokenizer that emits no tokens (or stops early)
      // would pass the test vacuously.
      TestCase.assertEquals("Wrong number of sentences in pass " + pass,
          expectedStrings.length, pos);

      tok.end();
      tok.reset(new StringReader(inputString));
    }
  }
}