// Copyright 2013 Thomas Müller // This file is part of MarMoT, which is licensed under GPLv3. package marmot.morph.cmd; import java.util.LinkedList; import java.util.List; import marmot.core.Sequence; import marmot.core.Token; import marmot.morph.MorphModel; import marmot.morph.MorphOptions; import marmot.morph.Word; import marmot.morph.io.SentenceReader; public class Stats { public static void main(String[] args) { MorphOptions options = new MorphOptions(); options.setPropertiesFromStrings(args); int train_tokens = 0; List<Sequence> train_sentences = new LinkedList<Sequence>(); for (Sequence sequence : new SentenceReader(options.getTrainFile())) { train_sentences.add(sequence); train_tokens += sequence.size(); } MorphModel model = new MorphModel(); model.init(options, train_sentences); System.out.println("Train sentences: " + train_sentences.size()); System.out.println("Train tokens: " + train_tokens); System.out.println("Pos tags: " + (model.getTagTables().get(0).size() - 1)); System.out.println("Morph tags: " + (model.getTagTables().get(1).size() - 1)); boolean has_test_file = options.getTestFile().length() > 0; if (has_test_file) { int test_tokens = 0; int oov_test_tokens = 0; List<Sequence> test_sentences = new LinkedList<Sequence>(); for (Sequence sequence : new SentenceReader(options.getTestFile())) { test_sentences.add(sequence); for (Token token : sequence) { Word word = (Word) token; model.addIndexes(word, false); if (word.getWordFormIndex() < 0) { oov_test_tokens += 1; } } test_tokens += sequence.size(); } System.out.println("OOV rate: " + (oov_test_tokens * 100.) / test_tokens); } } }