// Copyright 2014 Thomas Müller // This file is part of MarMoT, which is licensed under GPLv3. package marmot.tokenize.openlp; import java.io.FileInputStream; import java.io.IOException; import java.nio.charset.Charset; import java.util.regex.Pattern; import marmot.tokenize.Tokenizer; import opennlp.model.TrainUtil; import opennlp.tools.dictionary.Dictionary; import opennlp.tools.tokenize.TokenSample; import opennlp.tools.tokenize.TokenSampleStream; import opennlp.tools.tokenize.TokenizerFactory; import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.TrainingParameters; public class OpenNlpTokenizerTrainer { public final static int CUTOFF = 1; public Tokenizer train(String path) throws IOException { TokenizerModel model; ObjectStream<String> line_stream = new PlainTextByLineStream( new FileInputStream(path), Charset.forName("UTF-8")); ObjectStream<TokenSample> samples = new TokenSampleStream(line_stream); String lang_code = ""; Dictionary dict = null; Pattern alpha_numeric_pattern = null; TokenizerFactory factory = new TokenizerFactory(lang_code, dict, true, alpha_numeric_pattern); TrainingParameters params = TrainingParameters.defaultParams(); params.put(TrainUtil.CUTOFF_PARAM, Integer.toString(CUTOFF)); try { model = TokenizerME.train(samples, factory, params); } finally { samples.close(); } return new OpenNlpTokenizer(model); } }