// Copyright 2013 Thomas Müller // This file is part of MarMoT, which is licensed under GPLv3. package marmot.morph.cmd; import java.io.IOException; import java.io.Writer; import java.util.LinkedList; import java.util.List; import marmot.core.Sequence; import marmot.core.Token; import marmot.morph.MorphModel; import marmot.morph.MorphOptions; import marmot.morph.MorphTagger; import marmot.morph.Word; import marmot.morph.io.FileOptions; import marmot.morph.io.SentenceReader; import marmot.util.FileUtils; public class Trainer { public static MorphTagger train(MorphOptions options) { long time = System.currentTimeMillis(); List<Sequence> train_sentences = new LinkedList<Sequence>(); SentenceReader reader = new SentenceReader(options.getTrainFile()); if (options.getTagMorph()) reader.getFileOptions().dieIfPropertyIsEmpty( FileOptions.MORPH_INDEX); for (Sequence sentence : reader) { train_sentences.add(sentence); } reader = null; List<Sequence> test_sentences = null; if (!options.getTestFile().isEmpty()) { reader = new SentenceReader(options.getTestFile()); if (options.getTagMorph()) reader.getFileOptions().dieIfPropertyIsEmpty( FileOptions.MORPH_INDEX); test_sentences = new LinkedList<Sequence>(); for (Sequence sentence : reader) { test_sentences.add(sentence); } reader = null; } MorphTagger tagger = (MorphTagger) MorphModel.train(options, train_sentences, test_sentences); if (!options.getModelFile().isEmpty()) FileUtils.saveToFile(tagger, options.getModelFile()); if (options.getVerbose()) System.err.format("Training took: %ds\n", (System.currentTimeMillis() - time) / 1000); return tagger; } public static void main(String[] args) { MorphOptions options = new MorphOptions(); options.setPropertiesFromStrings(args); options.dieIfPropertyIsEmpty(MorphOptions.TRAIN_FILE); options.dieIfPropertyIsEmpty(MorphOptions.MODEL_FILE); MorphTagger tagger = train(options); MorphModel model = (MorphModel) tagger.getModel(); if (!options.getTestFile().isEmpty()) { List<Sequence> sentences = new LinkedList<Sequence>(); SentenceReader reader = new SentenceReader(options.getTestFile()); if (options.getTagMorph()) reader.getFileOptions().dieIfPropertyIsEmpty(FileOptions.MORPH_INDEX); for (Sequence sentence : reader) { for (Token token : sentence) { Word word = (Word) token; model.addIndexes(word, false); } sentences.add(sentence); } if (!options.getPredFile().isEmpty()) { try { Writer writer = FileUtils.openFileWriter( options.getPredFile()); Annotator.annotate(tagger, options.getTestFile(), writer); writer.close(); } catch (IOException e) { throw new RuntimeException(e); } } } } }