// Copyright 2015 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.

package marmot.tokenize.cmd;

import java.io.File;
import java.io.IOException;
import java.io.Writer;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;

import marmot.tokenize.RuleBasedTokenizer;
import marmot.tokenize.Tokenizer;
import marmot.tokenize.openlp.OpenNlpConverter;
import marmot.tokenize.openlp.OpenNlpTokenizerTrainer;
import marmot.tokenize.preprocess.Pair;
import marmot.tokenize.preprocess.WikiSelector;
import marmot.tokenize.rules.RuleProvider;
import marmot.util.FileUtils;
import marmot.util.GeneralLevenshteinLattice;
import marmot.util.LevenshteinLattice;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
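
/**
 * Command-line experiment runner: reads pairs of untokenized and tokenized
 * sentences from Wikipedia-derived data, trains OpenNLP-based tokenizers on
 * growing fractions of the training data, and reports sentence-, word- and
 * character-level error rates.
 *
 * Sample invocation (file names are illustrative):
 *
 *   java marmot.tokenize.cmd.Experimenter \
 *       --tokenized-file tok.txt \
 *       --untokenized-file untok.txt \
 *       --lang de
 */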
public class Experimenter {

    public static void main(String[] args) throws IOException, JSAPException {
        // Declare the command-line options via JSAP.
        FlaggedOption opt;
        JSAP jsap = new JSAP();

        opt = new FlaggedOption("tokenized-file").setRequired(true)
                .setLongFlag("tokenized-file");
        jsap.registerParameter(opt);

        opt = new FlaggedOption("untokenized-file").setRequired(true)
                .setLongFlag("untokenized-file");
        jsap.registerParameter(opt);

        opt = new FlaggedOption("lang").setRequired(true).setLongFlag("lang");
        jsap.registerParameter(opt);

        opt = new FlaggedOption("num-sentences").setLongFlag("num-sentences")
                .setStringParser(JSAP.INTEGER_PARSER).setDefault("1000");
        jsap.registerParameter(opt);

        opt = new FlaggedOption("random-seed").setLongFlag("random-seed")
                .setStringParser(JSAP.INTEGER_PARSER).setDefault("42");
        jsap.registerParameter(opt);

        opt = new FlaggedOption("verbosity").setLongFlag("verbosity")
                .setStringParser(JSAP.INTEGER_PARSER).setDefault("0");
        jsap.registerParameter(opt);

        JSAPResult config = jsap.parse(args);
        if (!config.success()) {
            for (Iterator<?> errs = config.getErrorMessageIterator(); errs.hasNext();) {
                System.err.println("Error: " + errs.next());
            }
            System.err.println("Usage: ");
            System.err.println(jsap.getUsage());
            System.err.println(jsap.getHelp());
            System.err.println();
            System.exit(1);
        }
String lang = config.getString("lang");
String tok_file = config.getString("tokenized-file");
String untok_file = config.getString("untokenized-file");
int num_sentences = config.getInt("num-sentences");
int verbosity = config.getInt("verbosity");
Random random = new Random(config.getInt("random-seed"));
// verbose: 0 no output
// 1 only success
// 2 all messages
// 3 only failure

        // German and Spanish data are read with expansion enabled (see WikiSelector).
        boolean expand = lang.equalsIgnoreCase("de") || lang.equalsIgnoreCase("es");

        // Collect the sentence pairs.
        List<Pair> pairs = new LinkedList<Pair>();
        for (Pair pair : new WikiSelector(untok_file, tok_file, expand, num_sentences)) {
            pairs.add(pair);
        }
        Collections.shuffle(pairs, random);

        // Shuffled 80/10/10 split: pairs at positions 0 (mod 10) go to the
        // development set, positions 1 (mod 10) to the test set, and the
        // rest to the training set.
        List<Pair> trnset = new LinkedList<Pair>();
        List<Pair> devset = new LinkedList<Pair>();
        List<Pair> tstset = new LinkedList<Pair>();
        int index = 0;
        for (Pair pair : pairs) {
            if (index == 0) {
                devset.add(pair);
            } else if (index == 1) {
                tstset.add(pair);
            } else {
                trnset.add(pair);
            }
            index = (index + 1) % 10;
        }

        // Train and evaluate with 1%, 10% and 100% of the training data.
        runExperiment(trnset, devset, tstset, 1., verbosity, lang);
        runExperiment(trnset, devset, tstset, 10., verbosity, lang);
        runExperiment(trnset, devset, tstset, 100., verbosity, lang);
    }
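
    /**
     * Trains a tokenizer on the first {@code percentage} percent of the
     * (already shuffled) training set and evaluates it on the development
     * set. The test set is passed through but not evaluated here.
     */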
    public static void runExperiment(List<Pair> trnset, List<Pair> devset,
            List<Pair> tstset, double percentage, int verbosity, String lang)
            throws IOException {
        // Restrict training to the requested fraction of the data.
        int trnset_size = (int) ((percentage * trnset.size()) / 100.);
        System.err.format("Trnsize: %d\n", trnset_size);
        trnset = trnset.subList(0, trnset_size);

        // Convert the training pairs into the OpenNLP training format and
        // write them to a temporary file.
        RuleProvider provider = RuleProvider.createRuleProvider(lang);
        OpenNlpConverter converter = new OpenNlpConverter(provider);
        File opennlp_file = File.createTempFile("opennlp_file", ".txt");
        opennlp_file.deleteOnExit();
        Writer writer = FileUtils.openFileWriter(opennlp_file.getAbsolutePath());
        converter.convert(trnset, writer, verbosity);
        writer.close();

        // Train an OpenNLP tokenizer and wrap it in a RuleBasedTokenizer
        // that applies the language-specific rules.
        OpenNlpTokenizerTrainer trainer = new OpenNlpTokenizerTrainer();
        Tokenizer tokenizer = trainer.train(opennlp_file.getAbsolutePath());
        tokenizer = new RuleBasedTokenizer(tokenizer, provider);

        runEvaluation(lang, tokenizer, devset);
    }
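
    /**
     * Reports three error rates: the fraction of sentences with at least one
     * tokenization mistake, and character- and token-level edit distances
     * between reference and prediction, normalized by reference length. For
     * example, if the reference is "do n't" and the prediction is "don 't",
     * that sentence counts as one sentence error, and the character and
     * token distances add the corresponding edit costs (as defined by
     * LevenshteinLattice and GeneralLevenshteinLattice, respectively).
     */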
    public static void runEvaluation(String lang, Tokenizer tokenizer,
            List<Pair> devset) {
        int sentence_errors = 0;
        int character_errors = 0;
        int word_errors = 0;
        int sentence_total = 0;
        int character_total = 0;
        int word_total = 0;

        for (Pair pair : devset) {
            List<String> predicted_tokens = tokenizer.tokenize(pair.untokenized);

            String actual_string = pair.tokenized;
            if (lang.equals("es")) {
                // The Spanish reference data joins multi-word tokens with
                // underscores; split them back into separate tokens.
                actual_string = actual_string.replace('_', ' ');
            }
            String[] array = actual_string.split("\\s+");
            List<String> actual_tokens = Arrays.asList(array);

            // Join the predicted tokens into a whitespace-separated string.
            StringBuilder sb = new StringBuilder();
            for (String token : predicted_tokens) {
                if (sb.length() > 0) {
                    sb.append(' ');
                }
                sb.append(token);
            }
            String predicted_string = sb.toString();

            // Sentence accuracy
            if (!predicted_tokens.equals(actual_tokens)) {
                sentence_errors += 1;
            }
            sentence_total += 1;

            // Character accuracy
            LevenshteinLattice lattice = new LevenshteinLattice(actual_string,
                    predicted_string);
            character_errors += lattice.getDistance();
            character_total += actual_string.length();

            // Word accuracy
            GeneralLevenshteinLattice<String> glattice = new GeneralLevenshteinLattice<String>(
                    actual_tokens, predicted_tokens);
            word_errors += glattice.getDistance();
            word_total += actual_tokens.size();
        }

        System.err.format("Sent Err: %d / %d = %g\n", sentence_errors,
                sentence_total, sentence_errors * 100. / sentence_total);
        System.err.format("Word Err: %d / %d = %g\n", word_errors, word_total,
                word_errors * 100. / word_total);
        System.err.format("Char Err: %d / %d = %g\n", character_errors,
                character_total, character_errors * 100. / character_total);
    }
}