package semanticMarkup.ling.transform.lib; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.util.LinkedList; import java.util.List; import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.util.InvalidFormatException; import semanticMarkup.ling.Token; import semanticMarkup.ling.transform.ITokenizer; public class OpenNLPTokenizer implements ITokenizer{ private TokenizerME myTokenizer; public OpenNLPTokenizer(String OpenNLPTokenizerDir) { // Get OpenNLP tokenizer InputStream tokenModelIn; try { tokenModelIn = new FileInputStream(OpenNLPTokenizerDir); TokenizerModel model = new TokenizerModel(tokenModelIn); this.myTokenizer = new TokenizerME(model); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (InvalidFormatException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } @Override public List<Token> tokenize(String text) { // TODO Auto-generated method stub String[] tempTokens = this.myTokenizer.tokenize(text); List<Token> tokens = new LinkedList<Token>(); for (int i=0;i<tempTokens.length;i++){ Token token = new Token(tempTokens[i]); tokens.add(token); } return tokens; } }