// Copyright 2014 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.tokenize.openlp;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.Arrays;
import java.util.List;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import marmot.tokenize.AbstractTokenizer;
/**
 * Tokenizer that delegates to an OpenNLP {@link TokenizerME} built from a
 * {@link TokenizerModel}.
 *
 * <p>The model field is {@code transient}, so default Java serialization
 * skips it; {@link #writeObject(ObjectOutputStream)} and
 * {@link #readObject(ObjectInputStream)} store and restore it through the
 * model's own {@code serialize} method and stream constructor instead.
 */
public class OpenNlpTokenizer extends AbstractTokenizer {

    private static final long serialVersionUID = 1L;

    /** OpenNLP model used to build tokenizers; written manually on serialization. */
    transient TokenizerModel model_;

    /**
     * Creates a tokenizer backed by the given model.
     *
     * @param model the OpenNLP tokenizer model; must not be {@code null}
     * @throws NullPointerException if {@code model} is {@code null}
     */
    public OpenNlpTokenizer(TokenizerModel model) {
        // Fail at construction time rather than with an unexplained NPE on
        // the first tokenize() or serialization call.
        if (model == null) {
            throw new NullPointerException("model must not be null");
        }
        model_ = model;
    }

    /**
     * Splits the given text into tokens using the wrapped model.
     *
     * @param untokenized the raw text to tokenize
     * @return the tokens in order, as a fixed-size list backed by the array
     *         returned from the OpenNLP tokenizer
     */
    @Override
    public List<String> tokenize(String untokenized) {
        // A fresh TokenizerME is created per call, so no tokenizer instance
        // is shared between calls.
        // NOTE(review): presumably because TokenizerME is not safe to share
        // across threads - confirm before caching it.
        Tokenizer tokenizer = new TokenizerME(model_);
        String[] tokens = tokenizer.tokenize(untokenized);
        return Arrays.asList(tokens);
    }

    /**
     * Serialization hook: writes the default (non-transient) state, then the
     * model via its own {@code serialize} method.
     *
     * @param oos the stream to write to
     * @throws IOException if writing to the stream fails
     */
    private void writeObject(ObjectOutputStream oos) throws IOException {
        oos.defaultWriteObject();
        model_.serialize(oos);
    }

    /**
     * Deserialization hook: reads the default state, then rebuilds the model
     * from the same stream, mirroring the order used by
     * {@link #writeObject(ObjectOutputStream)}.
     *
     * @param ois the stream to read from
     * @throws ClassNotFoundException if a class of a serialized object cannot be found
     * @throws IOException if reading from the stream or loading the model fails
     */
    private void readObject(ObjectInputStream ois)
            throws ClassNotFoundException, IOException {
        ois.defaultReadObject();
        model_ = new TokenizerModel(ois);
    }
}