// Copyright 2013 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.morph.mapper.latin;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import marmot.morph.mapper.MorphTag;
import marmot.morph.mapper.latin.ItMorphTag.CaseNumber;
import marmot.morph.mapper.latin.ItMorphTag.FlexionalCategory;
import marmot.morph.mapper.latin.ItMorphTag.NominalsDegree;
import marmot.morph.mapper.latin.LdtMorphTag.Case;
import marmot.morph.mapper.latin.LdtMorphTag.Degree;
import marmot.morph.mapper.latin.LdtMorphTag.Gender;
import marmot.morph.mapper.latin.LdtMorphTag.Mood;
import marmot.morph.mapper.latin.LdtMorphTag.Person;
import marmot.morph.mapper.latin.LdtMorphTag.Pos;
import marmot.morph.mapper.latin.LdtMorphTag.Tense;
import marmot.morph.mapper.latin.LdtMorphTag.Voice;
import marmot.util.FileUtils;
public class ItLdtMapper {
/**
 * Builds an {@link LdtMorphTag} from the given IT tag.
 *
 * <p>When the IT tag's {@code case_number} is {@code CaseNumber.G}
 * (adverbial), a freshly constructed, otherwise untouched
 * {@code LdtMorphTag} is returned. In every other case person, number,
 * tense, mood/voice, gender, case and degree are filled in by the
 * corresponding private setters.
 *
 * @param it_tag the IT tag to translate
 * @return the newly created LDT tag
 */
public MorphTag convert(ItMorphTag it_tag) {
    LdtMorphTag converted = new LdtMorphTag();
    boolean adverbial = it_tag.case_number == CaseNumber.G;
    if (!adverbial) {
        setPerson(converted, it_tag);
        setNumber(converted, it_tag);
        setTense(converted, it_tag);
        setMoodAndVoice(converted, it_tag);
        setGender(converted, it_tag);
        setCase(converted, it_tag);
        setDegree(converted, it_tag);
    }
    return converted;
}
/**
 * Copies the degree of comparison from the IT tag's
 * {@code participials_degree_} field into {@code ldt_tag.degree_}.
 *
 * <p>Only {@code participials_degree_} is consulted here; the IT tag's
 * {@code nominals_degree_} is not read by this method.
 *
 * @param ldt_tag the tag being filled in
 * @param it_tag  the tag being translated
 */
private void setDegree(LdtMorphTag ldt_tag, ItMorphTag it_tag) {
    Degree mapped;
    switch (it_tag.participials_degree_) {
        case Undef:
        case One: // positive (or unset): no degree marking in the target tag
            mapped = Degree.Undef;
            break;
        case Two: // comparative
            mapped = Degree.c;
            break;
        case Three: // superlative
            mapped = Degree.s;
            break;
        default:
            // Any value not listed above leaves degree_ as it was.
            return;
    }
    ldt_tag.degree_ = mapped;
}
/**
 * Derives {@code ldt_tag.case_} from the IT tag's combined
 * {@code case_number} value. Singular and plural variants of the same
 * case collapse onto one LDT case constant; everything else
 * (adverbial G, casus "plurimus" H, Undef, or any other value) becomes
 * {@code Case.Undef}.
 *
 * @param ldt_tag the tag being filled in
 * @param it_tag  the tag being translated
 */
private void setCase(LdtMorphTag ldt_tag, ItMorphTag it_tag) {
    Case mapped;
    switch (it_tag.case_number) {
        case A: // singular nominative
        case J: // plural nominative
            mapped = Case.n;
            break;
        case B: // singular genitive
        case K: // plural genitive
            mapped = Case.g;
            break;
        case C: // singular dative
        case L: // plural dative
            mapped = Case.d;
            break;
        case D: // singular accusative
        case M: // plural accusative
            mapped = Case.a;
            break;
        case E: // presumably singular vocative (no label in the original)
        case N: // presumably plural vocative (no label in the original)
            mapped = Case.v;
            break;
        case F: // singular ablative
        case O: // plural ablative
            mapped = Case.b;
            break;
        default: // G (adverbial), H (casus "plurimus"), Undef, anything else
            mapped = Case.Undef;
            break;
    }
    ldt_tag.case_ = mapped;
}
/**
 * Returns the set of LDT part-of-speech values that are compatible with
 * the given IT tag.
 *
 * <p>An adverbial IT tag ({@code case_number == CaseNumber.G}) yields
 * exactly {@code Pos.d}. Otherwise the choice is driven by
 * {@code flexional_type_}, refined by {@code nominals_degree_} (type One)
 * or {@code flexional_category_} (types Four and Five). The result may be
 * empty.
 *
 * @param ldt_tag not read by this method
 * @param it_tag  the IT tag to inspect
 * @return a new, mutable set of candidate POS values
 */
public Set<Pos> getPosCandidates(LdtMorphTag ldt_tag, ItMorphTag it_tag) {
    Set<Pos> result = new HashSet<Pos>();

    if (it_tag.case_number == CaseNumber.G) {
        result.add(Pos.d);
        return result;
    }

    switch (it_tag.flexional_type_) {
        case One: {
            // Pos.a is a candidate for every degree; the other three only
            // when the degree is positive (One) or unspecified.
            result.add(Pos.a);
            boolean positive_or_unset = it_tag.nominals_degree_ == NominalsDegree.One
                    || it_tag.nominals_degree_ == NominalsDegree.Undef;
            if (positive_or_unset) {
                result.add(Pos.n);
                result.add(Pos.m);
                result.add(Pos.p);
            }
            break;
        }
        case Two:
            result.add(Pos.t);
            break;
        case Three:
            result.add(Pos.v);
            break;
        case Four:
            if (it_tag.flexional_category_ != FlexionalCategory.O) {
                result.add(Pos.r);
            } else {
                result.add(Pos.c);
                result.add(Pos.d);
            }
            break;
        case Five:
            if (it_tag.flexional_category_ == FlexionalCategory.G) {
                result.add(Pos.m);
            }
            break;
        default:
            // No candidates for any other flexional type.
            break;
    }
    return result;
}
/**
 * Sets {@code ldt_tag.gender_} from the IT tag's
 * {@code gender_number_person_} field: One, Two and Three select
 * {@code Gender.m}, {@code Gender.f} and {@code Gender.n}; every other
 * value yields {@code Gender.Undef}.
 *
 * @param ldt_tag the tag being filled in
 * @param it_tag  the tag being translated
 */
private void setGender(LdtMorphTag ldt_tag, ItMorphTag it_tag) {
    Gender mapped;
    switch (it_tag.gender_number_person_) {
        case One:
            mapped = Gender.m;
            break;
        case Two:
            mapped = Gender.f;
            break;
        case Three:
            mapped = Gender.n;
            break;
        default:
            mapped = Gender.Undef;
            break;
    }
    ldt_tag.gender_ = mapped;
}
/**
 * Splits the IT tag's combined {@code mood_} value into the separate
 * {@code voice_} and {@code mood_} fields of the LDT tag.
 *
 * <p>The IT value encodes voice and mood together (e.g. A = active
 * indicative, J = passive/deponent indicative). Voice is resolved first,
 * then mood. A value that is not listed in either switch leaves the
 * corresponding LDT field unchanged.
 *
 * @param ldt_tag the tag being filled in
 * @param it_tag  the tag being translated
 */
private void setMoodAndVoice(LdtMorphTag ldt_tag, ItMorphTag it_tag) {
    // Voice component.
    switch (it_tag.mood_) {
        case A: // active indicative
        case B: // active subjunctive
        case C: // active imperative
        case D: // active participle
        case E: // active gerund
        case G: // active supine
        case H: // active infinitive
            ldt_tag.voice_ = Voice.a;
            break;
        case J: // pass/dep indicative
        case K: // pass/dep subjunctive
        case L: // pass/dep imperative
        case M: // pass/dep participle
        case N: // passive gerund
        case O: // pass/dep gerundive
        case P: // pass/dep supine
        case Q: // pass/dep infinitive
            ldt_tag.voice_ = Voice.p;
            break;
        case Undef: // none
            ldt_tag.voice_ = Voice.Undef;
            break;
    }

    // Mood component.
    switch (it_tag.mood_) {
        case A:
        case J: // indicative
            ldt_tag.mood_ = Mood.i;
            break;
        case B:
        case K: // subjunctive
            ldt_tag.mood_ = Mood.s;
            break;
        case C:
        case L: // imperative
            ldt_tag.mood_ = Mood.m;
            break;
        case D:
        case M: // participle
            ldt_tag.mood_ = Mood.p;
            break;
        case E:
        case N: // gerund
            ldt_tag.mood_ = Mood.d;
            break;
        case O: // gerundive
            ldt_tag.mood_ = Mood.g;
            break;
        case G:
        case P: // supine
            ldt_tag.mood_ = Mood.u;
            break;
        case H:
        case Q: // infinitive
            ldt_tag.mood_ = Mood.n;
            break;
        case Undef: // none
            ldt_tag.mood_ = Mood.Undef;
            break;
    }
}
/**
 * Sets {@code ldt_tag.tense_} from the IT tag's {@code tense_} field.
 * One through Six map to {@code Tense.p}, {@code i}, {@code f},
 * {@code r}, {@code l} and {@code t} respectively; any other value maps
 * to {@code Tense.Undef}.
 *
 * @param ldt_tag the tag being filled in
 * @param it_tag  the tag being translated
 */
private void setTense(LdtMorphTag ldt_tag, ItMorphTag it_tag) {
    Tense mapped;
    switch (it_tag.tense_) {
        case One:
            mapped = Tense.p;
            break;
        case Two:
            mapped = Tense.i;
            break;
        case Three:
            mapped = Tense.f;
            break;
        case Four:
            mapped = Tense.r;
            break;
        case Five:
            mapped = Tense.l;
            break;
        case Six:
            mapped = Tense.t;
            break;
        default:
            mapped = Tense.Undef;
            break;
    }
    ldt_tag.tense_ = mapped;
}
/**
 * Sets {@code ldt_tag.number_}.
 *
 * <p>The IT tag's {@code gender_number_person_} field takes precedence:
 * Four/Five/Six mean singular, Seven/Eight/Nine mean plural. Only when
 * that field holds some other value is the number read from
 * {@code case_number} instead (A-F singular; J-O plural); if that also
 * gives no answer the number is {@code Number.Undef}.
 *
 * @param ldt_tag the tag being filled in
 * @param it_tag  the tag being translated
 */
private void setNumber(LdtMorphTag ldt_tag, ItMorphTag it_tag) {
    // First choice: number encoded together with person.
    switch (it_tag.gender_number_person_) {
        case Four:
        case Five:
        case Six:
            ldt_tag.number_ = LdtMorphTag.Number.s;
            return;
        case Seven:
        case Eight:
        case Nine:
            ldt_tag.number_ = LdtMorphTag.Number.p;
            return;
        default:
            break;
    }

    // Fallback: number encoded together with case.
    switch (it_tag.case_number) {
        case A:
        case B:
        case C:
        case D:
        case E:
        case F:
            ldt_tag.number_ = LdtMorphTag.Number.s;
            break;
        case J:
        case K:
        case L:
        case M:
        case N:
        case O:
            ldt_tag.number_ = LdtMorphTag.Number.p;
            break;
        default:
            ldt_tag.number_ = LdtMorphTag.Number.Undef;
            break;
    }
}
/**
 * Sets {@code ldt_tag.person_} from the IT tag's
 * {@code gender_number_person_} field. Four and Seven give first person,
 * Five and Eight second person, Six and Nine third person; any other
 * value gives {@code Person.Undef}.
 *
 * @param ldt_tag the tag being filled in
 * @param it_tag  the tag being translated
 */
private void setPerson(LdtMorphTag ldt_tag, ItMorphTag it_tag) {
    Person mapped;
    switch (it_tag.gender_number_person_) {
        case Four:
        case Seven:
            mapped = Person.first;
            break;
        case Five:
        case Eight:
            mapped = Person.second;
            break;
        case Six:
        case Nine:
            mapped = Person.third;
            break;
        default:
            mapped = Person.Undef;
            break;
    }
    ldt_tag.person_ = mapped;
}
/**
 * Diagnostic entry point that measures how much of the LDT tag inventory
 * is reachable by converting IT tags with {@link #convert}.
 *
 * <p>Steps:
 * <ol>
 * <li>Read the LDT "tagwords" file. Each non-empty line is split on
 * whitespace and read as {@code count form pos morph}; word forms and
 * summed counts are collected per morph tag.</li>
 * <li>Read the IT "tagwords" file. Tokens 2-4 of each non-empty line are
 * joined with single spaces, parsed as an {@link ItMorphTag}, converted,
 * and the resulting tag string (minus its first character) is
 * recorded.</li>
 * <li>For every LDT morph tag that was never produced in step 2, compare
 * it character by character against the converted tags of IT entries
 * sharing a word form, accumulating per-position mismatch counts.</li>
 * <li>Print the mismatch counts and the share of LDT tokens whose tag was
 * never produced to {@code System.err}.</li>
 * </ol>
 *
 * <p>As a side effect {@code ItMorphTag.VERBOSE} is set to {@code false}.
 *
 * @param args optional; {@code args[0]} overrides the LDT tagwords path
 *             and {@code args[1]} the IT tagwords path. Without
 *             arguments the original hard-coded locations are used.
 * @throws IOException if reading or closing either file fails
 */
public static void main(String[] args) throws IOException {
    String ldt_path = (args != null && args.length > 0) ? args[0]
            : "/mount/projekte/sfb-732/d4/users/muellets/treebanks/latin/ldt-1.5/tagwords.txt";
    String it_path = (args != null && args.length > 1) ? args[1]
            : "/mount/projekte/sfb-732/d4/users/muellets/treebanks/latin/ittb/tagwords.txt";

    // morph tag -> word forms seen with it / summed token count
    Map<String, Set<String>> ldt_word_map = new HashMap<String, Set<String>>();
    Map<String, Integer> ldt_tag_vocab = new HashMap<String, Integer>();

    BufferedReader ldt_reader = FileUtils.openFile(ldt_path);
    try {
        String line;
        // readLine() == null is the only reliable end-of-stream signal;
        // Reader.ready() merely says whether a read might block.
        while ((line = ldt_reader.readLine()) != null) {
            line = line.trim();
            if (line.isEmpty()) {
                continue;
            }
            String[] tokens = line.split("\\s+");
            int count = Integer.parseInt(tokens[0]);
            String form = tokens[1];
            String morph = tokens[3];

            addToSetMap(ldt_word_map, morph, form);

            Integer word_count = ldt_tag_vocab.get(morph);
            ldt_tag_vocab.put(morph, word_count == null ? count : word_count + count);
        }
    } finally {
        ldt_reader.close();
    }

    ItLdtMapper mapper = new ItLdtMapper();
    ItMorphTag.VERBOSE = false;

    // word form -> "convertedTag#originalThirdTagToken" entries
    Map<String, Set<String>> it_word_map = new HashMap<String, Set<String>>();
    // every converted tag string produced from the IT file
    Set<String> converted_tags = new HashSet<String>();

    BufferedReader it_reader = FileUtils.openFile(it_path);
    try {
        String line;
        while ((line = it_reader.readLine()) != null) {
            line = line.trim();
            if (line.isEmpty()) {
                continue;
            }
            String[] tokens = line.split("\\s+");
            String form = tokens[1];
            String it_tag_string = tokens[2] + " " + tokens[3] + " "
                    + tokens[4];
            ItMorphTag it_tag = (ItMorphTag) ItMorphTag
                    .parseString(it_tag_string);
            LdtMorphTag ldt_tag = (LdtMorphTag) mapper.convert(it_tag);
            // Drop the first character of the rendered tag so it lines up
            // with the morph column read from the LDT file.
            String tag = ldt_tag.toString().substring(1);

            addToSetMap(it_word_map, form, tag + "#" + tokens[4]);
            converted_tags.add(tag);
        }
    } finally {
        it_reader.close();
    }

    int total = 0;
    int unseen = 0;
    Map<String, Integer> confusion_map = new HashMap<String, Integer>();
    for (Map.Entry<String, Set<String>> entry : ldt_word_map.entrySet()) {
        String tag = entry.getKey();
        int count = ldt_tag_vocab.get(tag);
        total += count;
        if (converted_tags.contains(tag)) {
            continue;
        }

        // Collect the converted IT tags of all word forms that carry the
        // unreachable LDT tag.
        Set<String> tag_set = new HashSet<String>();
        for (String form : entry.getValue()) {
            Set<String> tags = it_word_map.get(form);
            if (tags != null) {
                tag_set.addAll(tags);
            }
        }
        for (String compound_tag : tag_set) {
            String[] tags = compound_tag.split("#");
            // Called for its side effect on confusion_map; the returned
            // distance itself is not used.
            distance(tag, tags[0], confusion_map);
        }
        unseen += count;
    }

    for (Map.Entry<String, Integer> entry : confusion_map.entrySet()) {
        System.err.format("%s %d\n", entry.getKey(), entry.getValue());
    }
    System.err.format("%d / %d = %g", unseen, total, unseen * 100. / total);
}

/** Adds {@code value} to the set stored under {@code key}, creating the set on first use. */
private static void addToSetMap(Map<String, Set<String>> map, String key,
        String value) {
    Set<String> values = map.get(key);
    if (values == null) {
        values = new HashSet<String>();
        map.put(key, values);
    }
    values.add(value);
}
/**
 * Counts the positions at which two equally long tag strings differ and
 * records every difference in {@code map}.
 *
 * <p>For each differing position a key of the form
 * {@code "<index + 2> <char of tag> <char of string>"} is built and its
 * counter in {@code map} is incremented (starting from 1 when absent).
 * The offset of 2 presumably turns the 0-based index into a 1-based
 * position within a tag whose first character was stripped by the
 * caller - confirm against the caller.
 *
 * @param tag    the reference tag
 * @param string the tag to compare against; asserted to be as long as
 *               {@code tag}
 * @param map    mismatch counters, updated in place
 * @return the number of differing positions
 */
private static int distance(String tag, String string,
        Map<String, Integer> map) {
    assert tag.length() == string.length();
    int mismatches = 0;
    int position = 0;
    while (position < tag.length()) {
        char expected = tag.charAt(position);
        char actual = string.charAt(position);
        if (expected != actual) {
            String signature = String.format("%d %c %c", position + 2,
                    expected, actual);
            Integer seen = map.get(signature);
            map.put(signature, seen == null ? 1 : seen + 1);
            mismatches++;
        }
        position++;
    }
    return mismatches;
}
}