// Copyright 2013 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.morph.mapper.latin;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import marmot.morph.mapper.latin.LdtMorphTag.Case;
import marmot.morph.mapper.latin.LdtMorphTag.Degree;
import marmot.morph.mapper.latin.LdtMorphTag.Gender;
import marmot.morph.mapper.latin.LdtMorphTag.Mood;
import marmot.morph.mapper.latin.LdtMorphTag.Person;
import marmot.morph.mapper.latin.LdtMorphTag.Pos;
import marmot.morph.mapper.latin.LdtMorphTag.Tense;
import marmot.morph.mapper.latin.LdtMorphTag.Voice;
import marmot.util.FileUtils;
import marmot.util.LineIterator;
/**
 * Builds and queries a dictionary that maps a normalized word form to its
 * lemmas and, for each lemma, to the set of {@link LdtMorphTag}s read for it.
 *
 * <p>
 * The dictionary can be filled from three kinds of files: an analysis file
 * ({@link #readLatMorFile(String)}), a file of additional forms
 * ({@link #readMissingFile(String)}) and a file of corrected entries
 * ({@link #readWrongFile(String)}).
 */
public class LatMorReader {

    /** Normalized form -> lemma -> tags read for that (form, lemma) pair. */
    Map<String, Map<String, Set<LdtMorphTag>>> dict_;

    /** Creates a reader backed by a new, empty dictionary. */
    public LatMorReader() {
        dict_ = new HashMap<String, Map<String, Set<LdtMorphTag>>>();
    }

    /**
     * Creates a reader that works directly on the given dictionary. The map is
     * not copied, so entries read later are added to {@code dict} itself.
     *
     * @param dict
     *            form -> lemma -> tags map to read into and query
     */
    public LatMorReader(Map<String, Map<String, Set<LdtMorphTag>>> dict) {
        dict_ = dict;
    }

    /**
     * Reads an analysis file into the dictionary.
     *
     * <p>
     * The file is a sequence of entries. Each entry starts with a header line
     * {@code "> form"} and is followed by its reading lines (see
     * {@link #readReadings(BufferedReader, Map)}). Blank lines are ignored. If
     * a form is already present in the dictionary the new readings are merged
     * into its existing entry; a form that is not yet present is only added
     * when at least one reading was parsed for it.
     *
     * @param filename
     *            file to read, opened through {@code FileUtils.openFile}
     * @throws IOException
     *             if the file cannot be opened, read or closed
     * @throws RuntimeException
     *             if a reading line is malformed or has an unknown feature
     */
    public void readLatMorFile(String filename) throws IOException {
        // try-with-resources: the reader is closed even when parsing throws.
        try (BufferedReader reader = FileUtils.openFile(filename)) {
            // Header line handed back by readReadings(); null means that a
            // fresh line has to be read from the file.
            String line = null;
            while (reader.ready()) {
                if (line == null) {
                    String raw = reader.readLine();
                    if (raw == null) {
                        // ready() does not guarantee that a line exists.
                        break;
                    }
                    line = raw.trim();
                }
                if (line.isEmpty()) {
                    // Discard the blank line; keeping it would make the loop
                    // spin forever without consuming any input.
                    line = null;
                    continue;
                }
                assert line.startsWith("> ");
                String form = LatMorNormalizer.normalize(line.substring(2));
                Map<String, Set<LdtMorphTag>> readings = dict_.get(form);
                boolean insert = (readings == null);
                if (insert) {
                    readings = new HashMap<String, Set<LdtMorphTag>>();
                }
                // Consumes the reading lines and returns the next header.
                line = readReadings(reader, readings);
                if (insert && !readings.isEmpty()) {
                    dict_.put(form, readings);
                }
            }
        }
    }

    /**
     * Reads a file of additional forms into the dictionary.
     *
     * <p>
     * Only lines with more than two fields are used: field 0 is the form and
     * every field from index 2 on is a tag string ({@code aj}, {@code su},
     * {@code np}, {@code pn} or {@code nu}). Field 1 is not used. Unknown tag
     * strings are reported on {@code System.err} and skipped. The resulting
     * tags are stored under the placeholder lemma {@code "_"}, replacing any
     * tags previously stored under that lemma for the form.
     *
     * @param filename
     *            file to read through {@code LineIterator}
     * @throws IOException
     *             declared for callers; kept for interface compatibility
     */
    public void readMissingFile(String filename) throws IOException {
        LineIterator iterator = new LineIterator(filename);
        while (iterator.hasNext()) {
            List<String> line = iterator.next();
            if (line.size() <= 2) {
                continue;
            }
            String form = LatMorNormalizer.normalize(line.get(0));
            Set<LdtMorphTag> tags = new HashSet<LdtMorphTag>();
            for (String tag_string : line.subList(2, line.size())) {
                LdtMorphTag tag = new LdtMorphTag();
                switch (tag_string) {
                case "aj":
                    tag.pos_ = Pos.a;
                    break;
                case "su":
                case "np":
                    tag.pos_ = Pos.n;
                    break;
                case "pn":
                    tag.pos_ = Pos.p;
                    break;
                case "nu":
                    tag.pos_ = Pos.m;
                    break;
                default:
                    System.err.println("Unknown tag: " + tag_string);
                }
                // Presumably a fresh LdtMorphTag starts with Pos.Undef, so
                // this drops tags whose string was not recognized.
                if (tag.pos_ != Pos.Undef) {
                    tags.add(tag);
                }
            }
            if (!tags.isEmpty()) {
                getOrCreateReadings(form).put("_", tags);
            }
        }
    }

    /**
     * Reads the reading lines of one entry into {@code readings}.
     *
     * <p>
     * Lines are trimmed. Reading stops at the first line that starts with
     * {@code ">"} (the header of the next entry). Blank lines and lines that
     * start with {@code "no result for "} are skipped; every other line is
     * passed to {@link #parseReading(String, Map)}.
     *
     * @param reader
     *            source positioned just after an entry header
     * @param readings
     *            lemma -> tags map that receives the parsed readings
     * @return the trimmed header line that ended the entry; if the input ran
     *         out first, the last line that was read, or {@code null} if no
     *         line could be read
     * @throws IOException
     *             if reading fails
     * @throws RuntimeException
     *             if a reading line is malformed or has an unknown feature
     */
    public String readReadings(BufferedReader reader,
            Map<String, Set<LdtMorphTag>> readings) throws IOException {
        String line = null;
        while (reader.ready()) {
            String raw = reader.readLine();
            if (raw == null) {
                // End of stream although ready() reported true.
                return null;
            }
            line = raw.trim();
            if (line.startsWith(">")) {
                break;
            }
            if (line.isEmpty() || line.startsWith("no result for ")) {
                continue;
            }
            parseReading(line, readings);
        }
        return line;
    }

    /**
     * Parses one reading line of the shape {@code lemma<feature><feature>...}
     * and adds the resulting tag to the tag set of its lemma.
     *
     * <p>
     * Everything before the first {@code '<'} is the lemma; hyphens are
     * removed from it before it is normalized.
     *
     * @param line
     *            reading line to parse
     * @param readings
     *            lemma -> tags map that receives the parsed tag
     * @throws RuntimeException
     *             if the line contains no {@code '<'} or an unknown feature
     */
    public void parseReading(String line, Map<String, Set<LdtMorphTag>> readings) {
        int morph_start = line.indexOf('<');
        if (morph_start < 0) {
            throw new RuntimeException("Invalid reading: " + line);
        }
        String raw_lemma = line.substring(0, morph_start).replace("-", "");
        String lemma = LatMorNormalizer.normalize(raw_lemma);
        String morph_tag_string = line.substring(morph_start);
        Set<LdtMorphTag> set = readings.get(lemma);
        if (set == null) {
            set = new HashSet<LdtMorphTag>();
            readings.put(lemma, set);
        }
        set.add(parseMorpTagString(morph_tag_string));
    }

    /**
     * Converts a feature string such as {@code <V><pres><ind>} into a tag.
     *
     * <p>
     * The string is split at {@code '<'} and {@code '>'}; every non-empty
     * piece is applied with {@link #setFeature(LdtMorphTag, String)} and the
     * finished tag is passed through {@link #postProcess(LdtMorphTag)}.
     *
     * @param morph_tag_string
     *            the feature part of a reading line
     * @return the tag described by the features
     * @throws RuntimeException
     *             if a feature is unknown
     */
    public LdtMorphTag parseMorpTagString(String morph_tag_string) {
        LdtMorphTag tag = new LdtMorphTag();
        for (String feature : morph_tag_string.split("[<>]")) {
            if (!feature.isEmpty()) {
                setFeature(tag, feature);
            }
        }
        postProcess(tag);
        return tag;
    }

    /**
     * Removes feature values from a finished tag: tags with {@code Pos.p}
     * (set for {@code "PRO"}) lose their person and tags with {@code Pos.r}
     * (set for {@code "PREP"}) lose their case.
     *
     * @param tag
     *            tag to adjust in place
     */
    public void postProcess(LdtMorphTag tag) {
        if (tag.pos_ == Pos.p) {
            tag.person_ = Person.Undef;
        }
        if (tag.pos_ == Pos.r) {
            tag.case_ = Case.Undef;
        }
    }

    /**
     * Applies a single feature name to {@code tag}.
     *
     * <p>
     * A verb whose mood is {@code "part"} is tagged {@code Pos.t} instead of
     * {@code Pos.v}, regardless of whether {@code "V"} or {@code "part"} is
     * seen first. A number of known features are accepted but leave the tag
     * unchanged.
     *
     * @param tag
     *            tag to modify in place
     * @param feature
     *            feature name without the surrounding angle brackets
     * @throws RuntimeException
     *             if the feature name is not known
     */
    public static void setFeature(LdtMorphTag tag, String feature) {
        switch (feature) {
        // POS
        case "N":
        case "PN":
            tag.pos_ = Pos.n;
            break;
        case "V":
            tag.pos_ = (tag.mood_ == Mood.p) ? Pos.t : Pos.v;
            break;
        case "ADJ":
            tag.pos_ = Pos.a;
            break;
        case "PREP":
            tag.pos_ = Pos.r;
            break;
        case "ADV":
            tag.pos_ = Pos.d;
            break;
        case "CONJ":
            tag.pos_ = Pos.c;
            break;
        case "PRO":
            tag.pos_ = Pos.p;
            break;
        case "NUM":
            tag.pos_ = Pos.m;
            break;
        case "INTJ":
            tag.pos_ = LdtMorphTag.Pos.i;
            break;
        // Person
        case "1":
            tag.person_ = Person.first;
            break;
        case "2":
            tag.person_ = Person.second;
            break;
        case "3":
            tag.person_ = Person.third;
            break;
        // Number
        case "sg":
            tag.number_ = LdtMorphTag.Number.s;
            break;
        case "pl":
            tag.number_ = LdtMorphTag.Number.p;
            break;
        // Tense
        case "pres":
            tag.tense_ = Tense.p;
            break;
        case "imperf":
            tag.tense_ = Tense.i;
            break;
        case "perf":
            tag.tense_ = Tense.r;
            break;
        case "pqperf":
            tag.tense_ = Tense.l;
            break;
        case "futureII":
            tag.tense_ = Tense.t;
            break;
        case "future":
        case "futureI":
            tag.tense_ = Tense.f;
            break;
        // Mood
        case "ind":
            tag.mood_ = Mood.i;
            break;
        case "cond":
            // Must not fall through to "inf", which would overwrite the
            // mood with Mood.n.
            tag.mood_ = Mood.s;
            break;
        case "inf":
            tag.mood_ = Mood.n;
            break;
        case "imp":
            tag.mood_ = Mood.m;
            break;
        case "part":
            tag.mood_ = Mood.p;
            if (tag.pos_ == Pos.v) {
                tag.pos_ = Pos.t;
            }
            break;
        case "gerund":
            tag.mood_ = Mood.d;
            break;
        case "gerundivum":
            tag.mood_ = Mood.g;
            break;
        case "supinI":
        case "supinII":
            tag.mood_ = Mood.u;
            break;
        // Voice
        case "active":
            tag.voice_ = Voice.a;
            break;
        case "passive":
        case "deponens":
            tag.voice_ = Voice.p;
            break;
        // Gender
        case "masc":
            tag.gender_ = Gender.m;
            break;
        case "fem":
            tag.gender_ = Gender.f;
            break;
        case "neut":
            tag.gender_ = Gender.n;
            break;
        // Case (no locative is handled)
        case "nom":
            tag.case_ = Case.n;
            break;
        case "gen":
            tag.case_ = Case.g;
            break;
        case "dat":
            tag.case_ = Case.d;
            break;
        case "acc":
            tag.case_ = Case.a;
            break;
        case "abl":
            tag.case_ = Case.b;
            break;
        case "voc":
            tag.case_ = Case.v;
            break;
        // Degree
        case "superlative":
            tag.degree_ = Degree.s;
            break;
        case "comparative":
            tag.degree_ = Degree.c;
            break;
        // Known features that do not change the tag.
        // NOTE(review): "subj" is ignored here; if it denotes the
        // subjunctive mood it may need to set Mood.s - confirm against the
        // tag set of the analysis files.
        case "subj":
        case "positive":
        case "conj":
        case "alt":
        case "coord":
        case "dem":
        case "indef":
        case "subord":
        case "dist":
        case "card":
        case "rel":
        case "quest":
        case "ord":
        case "poss":
        case "refl":
        case "pers":
        case "dig":
        case "adj":
            break;
        default:
            throw new RuntimeException("Unknown feature: " + feature);
        }
    }

    /**
     * Collects the part-of-speech values stored for a form.
     *
     * @param form
     *            dictionary key to look up (used as given, not normalized
     *            here)
     * @param lemma
     *            lemma to restrict the lookup to, or {@code null} to combine
     *            the tags of all lemmas of the form
     * @return the part-of-speech values found; empty if the form, or the
     *         requested lemma of the form, is not in the dictionary
     */
    Set<Pos> getPosCandidates(String form, String lemma) {
        Set<Pos> tags = new HashSet<Pos>();
        Map<String, Set<LdtMorphTag>> lemmas = dict_.get(form);
        if (lemmas == null) {
            return tags;
        }
        Set<LdtMorphTag> ldt_tags;
        if (lemma != null) {
            // May be null when the lemma is unknown for this form.
            ldt_tags = lemmas.get(lemma);
        } else {
            ldt_tags = new HashSet<LdtMorphTag>();
            for (Set<LdtMorphTag> ldt_tag_set : lemmas.values()) {
                ldt_tags.addAll(ldt_tag_set);
            }
        }
        if (ldt_tags != null) {
            for (LdtMorphTag ldt_tag : ldt_tags) {
                tags.add(ldt_tag.pos_);
            }
        }
        return tags;
    }

    /**
     * Reads a file of corrected entries into the dictionary.
     *
     * <p>
     * Only lines with more than three fields are used: field 1 is the form,
     * field 2 is the lemma (used as given, without normalization) and every
     * field from index 3 on is a tag string ({@code a}, {@code n} or
     * {@code p}). Field 0 is not used. Unknown tag strings are reported on
     * {@code System.err} and skipped. The resulting tags replace any tags
     * previously stored for that form and lemma.
     *
     * @param filename
     *            file to read through {@code LineIterator}
     */
    public void readWrongFile(String filename) {
        LineIterator iterator = new LineIterator(filename);
        while (iterator.hasNext()) {
            List<String> line = iterator.next();
            if (line.size() <= 3) {
                continue;
            }
            String form = LatMorNormalizer.normalize(line.get(1));
            Set<LdtMorphTag> tags = new HashSet<LdtMorphTag>();
            for (String tag_string : line.subList(3, line.size())) {
                LdtMorphTag tag = new LdtMorphTag();
                switch (tag_string) {
                case "a":
                    tag.pos_ = Pos.a;
                    break;
                case "n":
                    tag.pos_ = Pos.n;
                    break;
                case "p":
                    tag.pos_ = Pos.p;
                    break;
                default:
                    System.err.println("Unknown tag: " + tag_string);
                }
                // Presumably a fresh LdtMorphTag starts with Pos.Undef, so
                // this drops tags whose string was not recognized.
                if (tag.pos_ != Pos.Undef) {
                    tags.add(tag);
                }
            }
            if (!tags.isEmpty()) {
                getOrCreateReadings(form).put(line.get(2), tags);
            }
        }
    }

    /**
     * Returns the lemmas stored for a form.
     *
     * @param form
     *            dictionary key to look up (used as given, not normalized
     *            here)
     * @return the key set of the form's lemma map, which is a live view of
     *         the dictionary, or {@code null} if the form is unknown
     */
    public Set<String> getLemmas(String form) {
        Map<String, Set<LdtMorphTag>> lemmas = dict_.get(form);
        if (lemmas == null) {
            return null;
        }
        return lemmas.keySet();
    }

    /** Returns the lemma map of {@code form}, registering an empty one if absent. */
    private Map<String, Set<LdtMorphTag>> getOrCreateReadings(String form) {
        Map<String, Set<LdtMorphTag>> readings = dict_.get(form);
        if (readings == null) {
            readings = new HashMap<String, Set<LdtMorphTag>>();
            dict_.put(form, readings);
        }
        return readings;
    }
}