// Copyright 2013 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.morph.mapper.latin;
import java.io.IOException;
import java.io.Writer;
import java.util.Calendar;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import marmot.core.Sequence;
import marmot.core.Tagger;
import marmot.morph.MorphModel;
import marmot.morph.MorphOptions;
import marmot.morph.Sentence;
import marmot.morph.Word;
import marmot.morph.mapper.Node;
import marmot.morph.mapper.SyntaxTree;
import marmot.morph.mapper.SyntaxTreeIterator;
import marmot.morph.mapper.latin.LdtMorphTag.Pos;
import marmot.util.Counter;
import marmot.util.FileUtils;
/**
 * Converts a treebank annotated with {@link ItMorphTag} style tags into
 * {@link LdtMorphTag} annotations and writes the result as a CoNLL file.
 *
 * <p>
 * Part-of-speech candidates are produced by {@link BrandoliniRules} and
 * {@link ItLdtMapper} and, when several remain, intersected with the readings
 * provided by a {@link LatMorReader}. Tokens that stay ambiguous are marked
 * with {@code Pos.x} and later resolved by a tagger trained on the rest of the
 * treebank.
 */
public class ItTreebankConverter {

    /** Number of tokens seen per ambiguity class (or "unk" / final POS). */
    Counter<String> amb_counter;

    /** For every ambiguity class, the forms that fell into it. */
    Map<String, Counter<String>> amb_map;

    LatMorReader latmor_reader_;

    /** POS tags among which an ambiguous ({@code Pos.x}) token is resolved. */
    private static final Set<Pos> nominals = new HashSet<Pos>();

    static {
        nominals.add(Pos.p);
        nominals.add(Pos.n);
        nominals.add(Pos.m);
        nominals.add(Pos.a);
    }

    /**
     * Creates a converter and loads the LatMor resources.
     *
     * <p>
     * Note that an {@link IOException} raised while reading any of the files
     * is rethrown wrapped in a {@link RuntimeException}; the {@code throws}
     * clause is kept for source compatibility with existing callers. As a side
     * effect the static flag {@code ItMorphTag.VERBOSE} is set to
     * {@code false}.
     *
     * @param latmore_file
     *            LatMor analysis file, always read
     * @param missing_file
     *            file passed to {@code readMissingFile}; skipped when
     *            {@code null}
     * @param wrong_file
     *            file passed to {@code readWrongFile}; skipped when
     *            {@code null}
     */
    public ItTreebankConverter(String latmore_file, String missing_file,
            String wrong_file) throws IOException {
        ItMorphTag.VERBOSE = false;
        latmor_reader_ = new LatMorReader();
        try {
            latmor_reader_.readLatMorFile(latmore_file);
            if (missing_file != null) {
                latmor_reader_.readMissingFile(missing_file);
            }
            if (wrong_file != null) {
                latmor_reader_.readWrongFile(wrong_file);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        amb_counter = new Counter<String>();
        amb_map = new HashMap<String, Counter<String>>();
    }

    /**
     * Reads the input treebank, resolves ambiguous POS tags and writes every
     * tree, followed by an empty line, to
     * {@code out_treebank_file + "-<year>-<month>-<day>.conll"}.
     *
     * @param in_treebank_file
     *            path of the treebank to convert
     * @param out_treebank_file
     *            prefix of the output path; the current date and the
     *            {@code .conll} extension are appended
     * @throws IOException
     *             if writing the output fails
     */
    public void convert(String in_treebank_file, String out_treebank_file)
            throws IOException {
        List<SyntaxTree> trees = readInitialTrees(in_treebank_file);
        replaceUnkownPosTags(trees);

        Calendar now = Calendar.getInstance();
        // Calendar.MONTH is zero-based (January == 0), hence the "+ 1".
        String date_string = String.format("-%d-%d-%d",
                now.get(Calendar.YEAR), now.get(Calendar.MONTH) + 1,
                now.get(Calendar.DAY_OF_MONTH));
        String out_file = out_treebank_file + date_string + ".conll";

        // try-with-resources closes the writer even if a tree fails to write.
        try (Writer writer = FileUtils.openFileWriter(out_file)) {
            for (SyntaxTree tree : trees) {
                tree.write(writer);
                writer.write('\n');
            }
        }
    }

    /**
     * Builds the tagger input for a single node.
     *
     * <p>
     * The lemma of the node is used as the word form. Nominal nodes receive
     * all nominal tags plus the dependency relation as features. Ambiguous
     * nodes ({@code Pos.x}) receive, unless {@code delete_x} is set, the
     * nominal tags licensed by LatMor (or all nominal tags if there are none)
     * plus the dependency relation. Every other node gets its upper-cased
     * lemma as both form and only feature.
     *
     * @param node
     *            node whose morph tag is an {@link LdtMorphTag}
     * @param delete_x
     *            if {@code true}, ambiguous nodes are treated like
     *            non-nominal nodes
     * @return the word handed to the tagger
     */
    public Word nodeToWord(Node node, boolean delete_x) {
        String lemma = node.getLemma();
        // Looked up unconditionally, exactly as before, in case the reader
        // relies on being queried for every token.
        Set<Pos> latmor_candidates = latmor_reader_.getPosCandidates(
                node.getForm(), lemma);

        LdtMorphTag tag = (LdtMorphTag) node.getMorphTag();
        Pos pos = tag.pos_;
        String form = lemma;
        String[] feats;

        if (nominals.contains(pos)) {
            feats = posFeatures(nominals, node.getDeprel());
        } else if (pos == Pos.x && !delete_x) {
            // Work on a copy: the returned set may be owned by the reader and
            // must not be narrowed in place.
            Set<Pos> candidates = new HashSet<Pos>(latmor_candidates);
            candidates.retainAll(nominals);
            if (candidates.isEmpty()) {
                candidates = nominals;
            }
            feats = posFeatures(candidates, node.getDeprel());
        } else {
            form = form.toUpperCase();
            feats = new String[] { form };
        }
        return new Word(form, pos.toString(), null, feats, null, null);
    }

    /** Returns the string forms of {@code tags} followed by {@code deprel}. */
    private static String[] posFeatures(Set<Pos> tags, String deprel) {
        String[] feats = new String[tags.size() + 1];
        int index = 0;
        for (Pos p : tags) {
            feats[index++] = p.toString();
        }
        feats[index] = deprel;
        return feats;
    }

    /**
     * Trains a tagger on all trees (ambiguous nodes are encoded like
     * non-nominal ones) and uses it to replace every {@code Pos.x} tag.
     *
     * @param trees
     *            trees to train on and to update in place
     */
    public void replaceUnkownPosTags(List<SyntaxTree> trees) {
        List<Sequence> sentences = new LinkedList<Sequence>();
        for (SyntaxTree tree : trees) {
            List<Word> words = new LinkedList<Word>();
            for (Node node : tree.getNodes()) {
                words.add(nodeToWord(node, true));
            }
            sentences.add(new Sentence(words));
        }

        MorphOptions opts = new MorphOptions();
        opts.setProperty(MorphOptions.SHAPE, "false");
        opts.setProperty(MorphOptions.TAG_MORPH, "false");
        opts.setProperty(MorphOptions.ORDER, "2");
        opts.setProperty(MorphOptions.VERBOSE, "false");
        opts.setProperty(MorphOptions.NUM_ITERATIONS, "10");

        Tagger tagger = MorphModel.train(opts, sentences, null);
        replaceUnkownPosTagsWithTagger(trees, tagger);
    }

    /**
     * Tags every tree that contains at least one {@code Pos.x} node and
     * overwrites the POS of those nodes with the tagger's prediction
     * ({@code n}, {@code a}, {@code m} or {@code p}). Any other prediction is
     * reported on standard error and the node keeps {@code Pos.x}.
     * Replacement statistics are printed to standard error at the end.
     *
     * @param trees
     *            trees to update in place
     * @param tagger
     *            tagger used to predict the POS of ambiguous nodes
     */
    public void replaceUnkownPosTagsWithTagger(List<SyntaxTree> trees,
            Tagger tagger) {
        Counter<Pos> counter = new Counter<Pos>();
        for (SyntaxTree tree : trees) {
            List<Word> words = new LinkedList<Word>();
            boolean contains_unkown = false;
            for (Node node : tree.getNodes()) {
                words.add(nodeToWord(node, false));
                if (((LdtMorphTag) node.getMorphTag()).pos_ == Pos.x) {
                    contains_unkown = true;
                }
            }
            if (!contains_unkown) {
                continue;
            }

            List<List<String>> tags = tagger.tag(new Sentence(words));
            assert tags.size() == tree.getNodes().size();

            for (int i = 0; i < tags.size(); i++) {
                Node node = tree.getNodes().get(i);
                LdtMorphTag tag = (LdtMorphTag) node.getMorphTag();
                if (tag.pos_ != Pos.x) {
                    continue;
                }
                String predicted = tags.get(i).get(0);
                switch (predicted) {
                case "n":
                    tag.pos_ = Pos.n;
                    break;
                case "a":
                    tag.pos_ = Pos.a;
                    break;
                case "m":
                    tag.pos_ = Pos.m;
                    break;
                case "p":
                    tag.pos_ = Pos.p;
                    break;
                default:
                    // Report what the tagger produced, not the node's old tag.
                    System.err.println("Unexpected tag: " + predicted);
                    break;
                }
                counter.increment(tag.pos_, 1.0);
            }
        }
        System.err.print("Replacement statistics: ");
        System.err.println(counter);
    }

    /**
     * Reads all trees of the treebank, normalizes form, lemma and dependency
     * relation of every node and attaches the converted {@link LdtMorphTag}.
     * Ambiguity statistics are printed to standard error.
     */
    private List<SyntaxTree> readInitialTrees(String in_treebank_file) {
        List<SyntaxTree> trees = new LinkedList<SyntaxTree>();
        // NOTE(review): the integer arguments presumably select input
        // columns -- confirm against SyntaxTreeIterator.
        SyntaxTreeIterator iterator = new SyntaxTreeIterator(in_treebank_file,
                1, 2, 4, 5, 6, 7, false);
        while (iterator.hasNext()) {
            SyntaxTree tree = iterator.next();
            for (Node node : tree.getNodes()) {
                String form = LatMorNormalizer.normalize(node.getForm());
                node.setForm(form);

                // Everything from the first '^' onwards is cut off the lemma.
                String lemma = LatMorNormalizer.normalize(node.getLemma());
                int index = lemma.indexOf('^');
                if (index >= 0) {
                    lemma = lemma.substring(0, index);
                }
                node.setLemma(lemma);

                String deprel = node.getDeprel().toLowerCase();
                node.setDeprel(deprel);

                // For two-character fine tags other than "oq" the coarse tag
                // is the second character, which is expected to be a digit.
                String fpos = node.getPos();
                String cpos = fpos;
                if (fpos.length() == 2 && !fpos.equalsIgnoreCase("oq")) {
                    cpos = fpos.substring(1);
                    assert Character.isDigit(cpos.charAt(0));
                }

                String tag_string = String.format("%s %s %s", cpos, fpos,
                        node.getFeats());

                LdtMorphTag tag;
                if (cpos.equals("Punc")) {
                    tag = new LdtMorphTag();
                    tag.pos_ = Pos.u;
                } else {
                    tag = convert(form, lemma, tag_string, deprel);
                }
                node.setMorphTag(tag);
            }
            trees.add(tree);
        }

        System.err.print("Ambiguity stats: ");
        System.err.println(amb_counter);
        Counter<String> unknown_forms = amb_map.get("unk");
        if (unknown_forms != null) {
            System.err.println("Unknown forms: " + unknown_forms.size());
        }
        return trees;
    }

    /**
     * Converts a single token annotation into an {@link LdtMorphTag}.
     *
     * <p>
     * POS candidates come from {@link BrandoliniRules}; if it yields none,
     * from {@link ItLdtMapper#getPosCandidates}. No candidate results in
     * {@code Pos.Undef}, a single candidate is taken as is, and several
     * candidates are disambiguated via
     * {@link #mergeWithLatMor(String, String, Set, LdtMorphTag)}.
     *
     * @param form
     *            normalized word form
     * @param lemma
     *            normalized lemma
     * @param tag_string
     *            tag description parsed by {@code ItMorphTag.parseString}
     * @param deprel
     *            dependency relation of the token
     * @return the converted tag with its POS set
     */
    public LdtMorphTag convert(String form, String lemma, String tag_string,
            String deprel) {
        BrandoliniRules rules = new BrandoliniRules();
        ItLdtMapper mapper = new ItLdtMapper();

        ItMorphTag it_tag = (ItMorphTag) ItMorphTag.parseString(tag_string);
        LdtMorphTag ldt_tag = (LdtMorphTag) mapper.convert(it_tag);

        Set<Pos> candidates = rules.getCandidates(form, lemma, deprel,
                ldt_tag, it_tag);
        if (candidates.isEmpty()) {
            candidates = mapper.getPosCandidates(ldt_tag, it_tag);
        }

        if (candidates.isEmpty()) {
            ldt_tag.pos_ = Pos.Undef;
        } else if (candidates.size() == 1) {
            ldt_tag.pos_ = candidates.iterator().next();
        } else {
            ldt_tag.pos_ = mergeWithLatMor(form, lemma, candidates, ldt_tag);
        }
        return ldt_tag;
    }

    /**
     * Chooses a POS for a token with several candidates by intersecting them
     * with the LatMor readings, first for the given lemma and then, if LatMor
     * lists the lemma {@code "_"} for the form, for that lemma. When neither
     * intersection is a single tag the result of {@link #ambiguous(Set)} is
     * returned. The ambiguity counters are updated along the way.
     *
     * @param form
     *            normalized word form
     * @param lemma
     *            normalized lemma
     * @param candidates
     *            POS candidates of the token; not modified
     * @param ldt_tag
     *            tag of the token, passed through to the merge step
     * @return the selected POS, {@code Pos.x} or {@code Pos.Undef}
     */
    public Pos mergeWithLatMor(String form, String lemma, Set<Pos> candidates,
            LdtMorphTag ldt_tag) {
        Set<Pos> tags = latmor_reader_.getPosCandidates(form, lemma);
        if (!tags.isEmpty()) {
            Pos by_lemma = mergeWithLatMor(tags, candidates, ldt_tag, form,
                    false);
            if (by_lemma != Pos.Undef) {
                return by_lemma;
            }
        }

        Set<String> lemmas = latmor_reader_.getLemmas(form);
        if (lemmas != null && lemmas.contains("_")) {
            tags = latmor_reader_.getPosCandidates(form, "_");
            Pos by_wildcard = mergeWithLatMor(tags, candidates, ldt_tag, form,
                    false);
            if (by_wildcard != Pos.Undef) {
                return by_wildcard;
            }
        } else {
            addToCounter("unk", form);
        }

        // Neither lookup produced a unique tag.
        Pos fallback = ambiguous(candidates);
        addToCounter(fallback.toString(), form);
        return fallback;
    }

    /**
     * Returns {@code Pos.x} if the set holds more than one tag and at least
     * one of them is nominal (a, m, p or n); {@code Pos.Undef} otherwise.
     */
    private Pos ambiguous(Set<Pos> merged_set) {
        boolean has_nominal = merged_set.contains(Pos.a)
                || merged_set.contains(Pos.m) || merged_set.contains(Pos.p)
                || merged_set.contains(Pos.n);
        if (merged_set.size() > 1 && has_nominal) {
            assert !merged_set.contains(Pos.c) || merged_set.contains(Pos.r);
            return Pos.x;
        }
        return Pos.Undef;
    }

    /**
     * Intersects {@code candidates} with {@code tags} and returns the single
     * remaining tag. If the intersection is empty or still ambiguous,
     * {@code Pos.Undef} is returned and, unless {@code found_lemma} is set,
     * the ambiguity class is counted.
     */
    private Pos mergeWithLatMor(Set<Pos> tags, Set<Pos> candidates,
            LdtMorphTag ldt_tag, String form, boolean found_lemma) {
        Set<Pos> merged_set = new HashSet<Pos>(candidates);
        merged_set.retainAll(tags);
        if (merged_set.size() == 1) {
            return merged_set.iterator().next();
        }
        if (!found_lemma) {
            addToAmbCounter(merged_set, form);
        }
        return Pos.Undef;
    }

    /**
     * Counts {@code form} under the sorted list of tags in {@code merged_set};
     * an empty set is counted under {@code "[.]"}.
     */
    private void addToAmbCounter(Set<Pos> merged_set, String form) {
        List<String> list = new LinkedList<String>();
        for (Pos pos : merged_set) {
            list.add(pos.toString());
        }
        Collections.sort(list);
        if (list.isEmpty()) {
            list.add(".");
        }
        addToCounter(list.toString(), form);
    }

    /** Increments the count of {@code string} and records {@code form} under it. */
    private void addToCounter(String string, String form) {
        amb_counter.increment(string, 1.0);
        Counter<String> forms = amb_map.get(string);
        if (forms == null) {
            forms = new Counter<String>();
            amb_map.put(string, forms);
        }
        forms.increment(form, 1.0);
    }

    /**
     * Command line entry point.
     *
     * @param args
     *            LatMor file, missing file, wrong file, input treebank and
     *            output treebank prefix, in this order
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 5) {
            System.err.println("Usage: ItTreebankConverter <latmor-file> "
                    + "<missing-file> <wrong-file> <in-treebank> "
                    + "<out-treebank-prefix>");
            System.exit(1);
        }
        ItTreebankConverter conv = new ItTreebankConverter(args[0], args[1],
                args[2]);
        conv.convert(args[3], args[4]);
    }
}