package semanticMarkup.ling.transform.lib;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.LinkedList;
import java.util.List;

import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.util.InvalidFormatException;

import semanticMarkup.ling.Token;
import semanticMarkup.ling.transform.ITokenizer;

/**
 * {@link ITokenizer} that splits a text into sentences using an OpenNLP
 * {@link SentenceDetectorME}. Each detected sentence is wrapped in a {@link Token}.
 */
public class OpenNLPSentencesTokenizer implements ITokenizer {

	/** Sentence detector built from the model file; {@code null} if loading the model failed. */
	private final SentenceDetectorME mySentenceDetector;

	/** Path handed to the constructor, kept only for error reporting. */
	private final String modelPath;

	/** Cause of a failed model load, or {@code null} if the model was loaded. */
	private final IOException loadFailure;

	/**
	 * Loads the OpenNLP sentence model and creates the sentence detector.
	 *
	 * <p>A failure to read the model (for example a {@link FileNotFoundException} or an
	 * {@link InvalidFormatException}) does not make the constructor throw: the stack trace
	 * is printed and the failure is remembered, so that {@link #tokenize(String)} can report
	 * it with its cause.
	 *
	 * @param openNLPSentenceDetectorDir path that is opened as a file to read the sentence
	 *        model from (despite the parameter name, it is passed directly to
	 *        {@link FileInputStream})
	 */
	public OpenNLPSentencesTokenizer(String openNLPSentenceDetectorDir) {
		this.modelPath = openNLPSentenceDetectorDir;

		SentenceDetectorME detector = null;
		IOException failure = null;
		InputStream sentModelIn = null;
		try {
			sentModelIn = new FileInputStream(openNLPSentenceDetectorDir);
			SentenceModel model = new SentenceModel(sentModelIn);
			detector = new SentenceDetectorME(model);
		} catch (IOException e) {
			// FileNotFoundException and InvalidFormatException are IOExceptions as well,
			// so this single handler covers every checked failure of the model load.
			failure = e;
			e.printStackTrace();
		} finally {
			// The stream is only needed while the model is being read; always release it.
			if (sentModelIn != null) {
				try {
					sentModelIn.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing a read-only stream fails.
				}
			}
		}

		this.mySentenceDetector = detector;
		this.loadFailure = failure;
	}

	/**
	 * Splits the given text into sentences.
	 *
	 * @param text the text to split
	 * @return one {@link Token} per detected sentence, in the order returned by the detector
	 * @throws IllegalStateException if the sentence model could not be loaded when this
	 *         tokenizer was constructed; the original load failure is attached as the cause
	 */
	@Override
	public List<Token> tokenize(String text) {
		if (this.mySentenceDetector == null) {
			throw new IllegalStateException(
					"OpenNLP sentence model could not be loaded from: " + this.modelPath,
					this.loadFailure);
		}

		String[] sentenceArray = this.mySentenceDetector.sentDetect(text);
		List<Token> sentences = new LinkedList<Token>();
		for (String sentence : sentenceArray) {
			sentences.add(new Token(sentence));
		}
		return sentences;
	}
}