// Copyright 2013 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.morph.cmd;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import lemming.lemma.Lemmatizer;

import marmot.core.Sequence;
import marmot.morph.MorphDictionary;
import marmot.morph.MorphOptions;
import marmot.morph.MorphTagger;
import marmot.morph.MorphWeightVector;
import marmot.morph.Sentence;
import marmot.morph.Word;
import marmot.morph.io.SentenceReader;
import marmot.util.FileUtils;
import marmot.util.Sys;
/**
 * Command-line tool that annotates a test file with lemmas, POS tags and
 * morphological features using a previously trained MarMoT model.
 *
 * Output is one token per line in a CoNLL-like, tab-separated format:
 * index, form, gold lemma, predicted lemma, gold POS, predicted POS,
 * gold morph, predicted morph — with sentences separated by a blank line.
 * Missing values are written as {@code "_"}.
 */
public class Annotator {

    /** Column separator of the output format. */
    private static final char SEPARATOR_ = '\t';
    /** Placeholder written for missing or unavailable values. */
    private static final String EMPTY_ = "_";

    /**
     * Entry point. Requires the {@code model-file}, {@code pred-file} and
     * {@code test-file} properties; optionally loads a pipeline lemmatizer
     * and extends the model's morphological dictionary.
     *
     * @param args property assignments parsed by {@link MorphOptions}
     */
    public static void main(String[] args) {
        MorphOptions options = new MorphOptions();
        options.setPropertiesFromStrings(args);
        options.dieIfPropertyIsEmpty(MorphOptions.MODEL_FILE);
        options.dieIfPropertyIsEmpty(MorphOptions.PRED_FILE);
        options.dieIfPropertyIsEmpty(MorphOptions.TEST_FILE);

        MorphTagger tagger = FileUtils.loadFromFile(options.getModelFile());

        // Optional external lemmatizer plugged into the tagging pipeline.
        String lemmatizer_file = options.getLemmatizerFile();
        if (!lemmatizer_file.isEmpty()) {
            Lemmatizer lemmatizer = FileUtils.loadFromFile(lemmatizer_file);
            tagger.setPipeLineLemmatizer(lemmatizer);
        }

        if (options.getVerbose()) {
            System.err.format("Loaded model, currently using %g MB of RAM\n",
                    Sys.getUsedMemoryInMegaBytes());
        }

        // Optionally extend the morphological dictionary baked into the model.
        if (!options.getMorphDict().isEmpty()) {
            MorphWeightVector vector = (MorphWeightVector) tagger.getWeightVector();
            MorphDictionary dict = vector.getMorphDict();
            if (dict != null) {
                dict.addWordsFromFile(options.getMorphDict());
            } else {
                System.err.format("Warning: Can't add words from morph. dictionary, because morph. dictionary is null!\n");
            }
        }

        String pred_file = options.getPredFile();
        // try-with-resources: the writer is closed even when annotation
        // throws (the original leaked the file handle on IOException).
        // An empty pred-file means "write to stdout"; the charset is given
        // explicitly instead of relying on the platform default.
        try (Writer writer = pred_file.isEmpty()
                ? new BufferedWriter(new OutputStreamWriter(System.out, StandardCharsets.UTF_8))
                : FileUtils.openFileWriter(pred_file)) {
            annotate(tagger, options.getTestFile(), writer);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Annotates every sentence of {@code text_file} and writes the result
     * to {@code writer}. The writer is not closed by this method.
     *
     * @param tagger    the loaded tagging model
     * @param text_file path of the input file, parsed by {@link SentenceReader}
     * @param writer    destination for the annotated output
     * @throws IOException if writing fails
     */
    public static void annotate(MorphTagger tagger, String text_file, Writer writer)
            throws IOException {
        SentenceReader reader = new SentenceReader(text_file);
        for (Sequence sequence : reader) {
            annotate(tagger, sequence, writer);
        }
    }

    /**
     * Annotates a single sentence and writes it to {@code writer}, followed
     * by a blank line. Empty sentences are skipped with a warning. If the
     * tagger runs out of memory on a long sentence, placeholder columns are
     * emitted instead of predictions so the output stays aligned with the
     * input.
     *
     * @param tagger   the loaded tagging model
     * @param sequence the sentence to tag; must be a {@link Sentence}
     * @param writer   destination for the annotated output
     * @throws IOException if writing fails
     */
    public static void annotate(MorphTagger tagger, Sequence sequence, Writer writer)
            throws IOException {
        Sentence sentence = (Sentence) sequence;
        if (sentence.isEmpty()) {
            System.err.println("Warning: Skipping empty sentence!");
            return;
        }

        List<List<String>> lemma_tags;
        try {
            lemma_tags = tagger.tagWithLemma(sentence);
        } catch (OutOfMemoryError e) {
            // Degrade gracefully: emit "_" predictions rather than aborting
            // the whole run on one oversized sentence.
            lemma_tags = new ArrayList<List<String>>(sentence.size());
            List<String> lemma_tag = Arrays.asList(EMPTY_, EMPTY_);
            for (int index = 0; index < sentence.size(); index++) {
                lemma_tags.add(lemma_tag);
            }
            System.err.format("Warning: Can't tag sentence of length: %d (Not enough memory)!\n",
                    sentence.size());
        }

        for (int i = 0; i < sentence.size(); i++) {
            Word word = sentence.getWord(i);
            List<String> token_lemma_tags = lemma_tags.get(i);

            // Token index (1-based) and surface form.
            writer.append(Integer.toString(i + 1));
            writer.append(SEPARATOR_);
            writer.append(word.getWordForm());

            // Lemma: gold, then predicted.
            writer.append(SEPARATOR_);
            writer.append(word.getLemma() != null ? word.getLemma() : EMPTY_);
            writer.append(SEPARATOR_);
            String lemma = token_lemma_tags.get(0);
            writer.append(lemma != null ? lemma : EMPTY_);

            // POS: gold, then predicted. The predicted tag is null-guarded
            // like every other column (the original would have printed the
            // literal string "null" via Writer.append(null)).
            writer.append(SEPARATOR_);
            writer.append(word.getPosTag() != null ? word.getPosTag() : EMPTY_);
            writer.append(SEPARATOR_);
            String pos = token_lemma_tags.get(1);
            writer.append(pos != null ? pos : EMPTY_);

            // Morphological features: gold, then predicted (may be absent
            // for models trained without a morph layer).
            writer.append(SEPARATOR_);
            writer.append(word.getMorphTag() != null ? word.getMorphTag() : EMPTY_);
            writer.append(SEPARATOR_);
            String morph = EMPTY_;
            if (token_lemma_tags.size() > 2 && token_lemma_tags.get(2) != null) {
                morph = token_lemma_tags.get(2);
            }
            writer.append(morph);
            writer.append('\n');
        }
        // Blank line terminates the sentence.
        writer.append('\n');
    }
}