/* * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ------------------- * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit * http://www.manning.com/ingersoll */ package com.tamingtext.sentences; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.text.BreakIterator; import java.util.ArrayList; import java.util.List; import java.util.Locale; import opennlp.tools.sentdetect.SentenceDetector; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import org.junit.Test; import com.tamingtext.TamingTextTestJ4; public class SentenceDetectionTest extends TamingTextTestJ4 { @Test public void testBreakIterator() { //<start id="sentDetect"/> BreakIterator sentIterator = BreakIterator.getSentenceInstance(Locale.US); String testString = "This is a sentence. It has fruits, vegetables," + " etc. but does not have meat. Mr. Smith went to Washington."; sentIterator.setText(testString); int start = sentIterator.first(); int end = -1; List<String> sentences = new ArrayList<String>(); while ((end = sentIterator.next()) != BreakIterator.DONE) { String sentence = testString.substring(start, end); start = end; sentences.add(sentence); System.out.println("Sentence: " + sentence); } //<end id="sentDetect"/> } @Test public void testOpenNLP() throws Exception { File modelDir = getModelDir(); //<start id="openSentDetect"/> //... Setup the models File modelFile = new File(modelDir, "en-sent.bin"); InputStream modelStream = new FileInputStream(modelFile); SentenceModel model = new SentenceModel(modelStream); SentenceDetector detector = //<co id="openSentDetect.co.detect"/> new SentenceDetectorME(model); String testString = "This is a sentence. It has fruits, vegetables," + " etc. but does not have meat. Mr. Smith went to Washington."; String[] result = detector.sentDetect(testString); //<co id="openSentDetect.co.run"/> for (int i = 0; i < result.length; i++) { System.out.println("Sentence: " + result[i]); } /*<calloutlist> <callout arearefs="openSentDetect.co.detect"><para>Create the <command>SentenceDetector</command> with the en-sent.bin model</para></callout> <callout arearefs="openSentDetect.co.run"><para>Invoke the detection process</para></callout> </calloutlist>*/ //<end id="openSentDetect"/> assertTrue("result Size: " + result.length + " is not: " + 3, result.length == 3); } }