// Copyright 2015 Thomas Müller // This file is part of MarMoT, which is licensed under GPLv3. package marmot.tokenize.cmd; import java.io.File; import java.io.IOException; import java.io.Writer; import java.util.Iterator; import marmot.tokenize.RuleBasedTokenizer; import marmot.tokenize.Tokenizer; import marmot.tokenize.openlp.OpenNlpConverter; import marmot.tokenize.openlp.OpenNlpTokenizerTrainer; import marmot.tokenize.preprocess.Pair; import marmot.tokenize.preprocess.WikiSelector; import marmot.tokenize.rules.RuleProvider; import marmot.util.FileUtils; import com.martiansoftware.jsap.FlaggedOption; import com.martiansoftware.jsap.JSAP; import com.martiansoftware.jsap.JSAPException; import com.martiansoftware.jsap.JSAPResult; public class Trainer { public static void main(String[] args) throws IOException, JSAPException { FlaggedOption opt; JSAP jsap = new JSAP(); opt = new FlaggedOption("tokenized-file").setRequired(true).setLongFlag( "tokenized-file"); jsap.registerParameter(opt); opt = new FlaggedOption("untokenized-file").setRequired(true).setLongFlag( "untokenized-file"); jsap.registerParameter(opt); opt = new FlaggedOption("model-file").setRequired(true).setLongFlag( "model-file"); jsap.registerParameter(opt); opt = new FlaggedOption("lang").setRequired(true).setLongFlag( "lang"); jsap.registerParameter(opt); opt = new FlaggedOption("num-sentences").setRequired(true).setLongFlag( "num-sentences").setStringParser(JSAP.INTEGER_PARSER).setDefault("1000"); jsap.registerParameter(opt); opt = new FlaggedOption("verbose").setRequired(true).setLongFlag( "verbose").setStringParser(JSAP.INTEGER_PARSER).setDefault("0"); jsap.registerParameter(opt); JSAPResult config = jsap.parse(args); if (!config.success()) { for (Iterator<?> errs = config.getErrorMessageIterator(); errs .hasNext();) { System.err.println("Error: " + errs.next()); } System.err.println("Usage: "); System.err.println(jsap.getUsage()); System.err.println(jsap.getHelp()); System.err.println(); System.exit(1); } String lang = config.getString("lang"); String tok_file = config.getString("tokenized-file"); String untok_file = config.getString("untokenized-file"); String model_file = config.getString("model-file"); int num_sentences = config.getInt("num-sentences"); int verbose = config.getInt("verbose"); // verbose: 0 no output // 1 only success // 2 all messages // 3 only failure boolean expand = lang.equalsIgnoreCase("de") || lang.equalsIgnoreCase("es"); Iterable<Pair> pairs = new WikiSelector(untok_file, tok_file, expand, num_sentences); RuleProvider provider = RuleProvider.createRuleProvider(lang); OpenNlpConverter converter = new OpenNlpConverter(provider); System.out.println("Starting alignment for '"+lang+"' textset"); File opennlp_file = File.createTempFile("openlp_file", ".txt"); Writer writer = FileUtils.openFileWriter(opennlp_file.getAbsolutePath()); converter.convert(pairs, writer, verbose); writer.close(); OpenNlpTokenizerTrainer trainer = new OpenNlpTokenizerTrainer(); Tokenizer tokenizer = trainer.train(opennlp_file.getAbsolutePath()); tokenizer = new RuleBasedTokenizer(tokenizer, provider); tokenizer.saveToFile(model_file); } }