package uk.ac.shef.dcs.jate.nlp.opennlp; import opennlp.tools.sentdetect.SentenceDetector; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.util.Span; import uk.ac.shef.dcs.jate.nlp.SentenceSplitter; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import org.apache.log4j.Logger; public class SentenceSplitterOpenNLP implements SentenceSplitter { private static Logger LOG = Logger.getLogger(SentenceSplitterOpenNLP.class.getName()); protected SentenceDetector sentenceDetector; public SentenceSplitterOpenNLP(InputStream model) throws IOException { LOG.info("Initializing OpenNLP sentence splitter..."); sentenceDetector = new SentenceDetectorME(new SentenceModel(model)); } public SentenceSplitterOpenNLP(String modelFile) throws IOException { LOG.info("Initializing OpenNLP sentence splitter..."); FileInputStream modelFileStream = new FileInputStream(modelFile); try { sentenceDetector = new SentenceDetectorME(new SentenceModel(modelFileStream)); } finally { modelFileStream.close(); } } public SentenceSplitterOpenNLP(File modelFile) throws IOException { LOG.info("Initializing OpenNLP sentence splitter..."); sentenceDetector = new SentenceDetectorME(new SentenceModel(modelFile)); } public List<int[]> split(String text) { Span[] offsets = sentenceDetector.sentPosDetect(text); List<int[]> rs = new ArrayList<>(); for (Span s : offsets) { rs.add(new int[]{s.getStart(), s.getEnd()}); } return rs; } }