package org.apache.lucene.analysis.jate;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import java.util.concurrent.Exchanger;
/**
 * Factory for {@link OpenNLPTokenizer}.
 *
 * <p>Recognised arguments:
 * <ul>
 *   <li>{@code tokenizerModel} (required): resource name of the OpenNLP tokenizer model.</li>
 *   <li>{@code sentenceModel} (optional): resource name of the OpenNLP sentence model; when
 *       absent, tokenizers are created with a {@code null} sentence detector.</li>
 *   <li>{@code paragraphChunker-class} (optional): fully qualified name of a
 *       {@link ParagraphChunker} implementation with a no-argument constructor.</li>
 * </ul>
 *
 * <p>The models are loaded once in {@link #inform(ResourceLoader)}; a fresh sentence detector and
 * tokenizer are built from them for every {@link #create(AttributeFactory)} call.
 */
public class OpenNLPTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {
    private final String sentenceModelFile;
    private final String tokenizerModelFile;
    private final String parChunkingClass;

    // Loaded by inform(); remain null until then.
    private SentenceModel sentenceModel;
    private TokenizerModel tokenizerModel;
    // NOTE(review): a single chunker instance is shared by every tokenizer this factory
    // creates, as in the original code; assumed to be safe to share - TODO confirm.
    private ParagraphChunker paragraphChunker;

    /**
     * Creates a new OpenNLPTokenizerFactory.
     *
     * @param args factory arguments; {@code sentenceModel}, {@code tokenizerModel} and
     *             {@code paragraphChunker-class} are read from this map
     */
    public OpenNLPTokenizerFactory(Map<String, String> args) {
        super(args);
        sentenceModelFile = args.get("sentenceModel");
        tokenizerModelFile = args.get("tokenizerModel");
        parChunkingClass = args.get("paragraphChunker-class");
    }

    /**
     * Creates a tokenizer backed by its own OpenNLP sentence detector and tokenizer.
     *
     * @param factory attribute factory handed to the new tokenizer
     * @return a new {@link OpenNLPTokenizer}; the paragraph chunker is passed only when one
     *         was configured
     */
    @Override
    public Tokenizer create(AttributeFactory factory) {
        // OpenNLP's SentenceDetectorME/TokenizerME are documented as not thread-safe, while
        // the loaded models can be shared. Build per-tokenizer instances so tokenizers used
        // on different threads never share mutable OpenNLP state.
        SentenceDetector sentenceOp =
                sentenceModel == null ? null : new SentenceDetectorME(sentenceModel);
        opennlp.tools.tokenize.Tokenizer tokenizerOp =
                tokenizerModel == null ? null : new TokenizerME(tokenizerModel);

        if (paragraphChunker == null) {
            return new OpenNLPTokenizer(factory, sentenceOp, tokenizerOp);
        }
        return new OpenNLPTokenizer(factory, sentenceOp, tokenizerOp, paragraphChunker);
    }

    /**
     * Loads the configured OpenNLP models and instantiates the optional paragraph chunker.
     *
     * @param loader resource loader used to open the model files
     * @throws IOException if {@code tokenizerModel} was not configured, a model cannot be
     *                     read, or the paragraph chunker class cannot be instantiated
     */
    @Override
    public void inform(ResourceLoader loader) throws IOException {
        // Validate the required parameter before doing any I/O.
        if (tokenizerModelFile == null) {
            throw new IOException(
                    "Parameter 'tokenizerModel' is required, but is invalid:" + tokenizerModelFile);
        }

        if (sentenceModelFile != null) {
            // The model constructors read the stream but leave closing it to the caller.
            try (InputStream in = loader.openResource(sentenceModelFile)) {
                sentenceModel = new SentenceModel(in);
            }
        }

        try (InputStream in = loader.openResource(tokenizerModelFile)) {
            tokenizerModel = new TokenizerModel(in);
        }

        if (parChunkingClass != null) {
            try {
                paragraphChunker = Class.forName(parChunkingClass)
                        .asSubclass(ParagraphChunker.class)
                        .getDeclaredConstructor()
                        .newInstance();
            } catch (ReflectiveOperationException | ClassCastException e) {
                // Keep the IOException contract callers of inform() already handle.
                throw new IOException(e);
            }
        }
    }
}