// Copyright 2013 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.morph.mapper.german;
import java.io.IOException;
import java.io.Writer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import marmot.morph.mapper.Node;
import marmot.morph.mapper.SyntaxTree;
import marmot.morph.mapper.SyntaxTreeIterator;
import marmot.util.Counter;
import marmot.util.FileUtils;
import marmot.util.LineIterator;
public class SmorReader {
public static void main(String[] args) throws IOException {
SmorReader reader = new SmorReader();
Map<String, Set<SttsTag>> dict = reader.readFile(args[0]);
Map<String, Set<SttsTag>> pos_dict = reader.readPosFile(args[1]);
for (int i = 1; i<args.length; i++) {
System.err.println(args[i]);
reader.test(dict, pos_dict, args[i]);
}
}
private void test(Map<String, Set<SttsTag>> dict,
Map<String, Set<SttsTag>> pos_dict, String string)
throws IOException {
Writer writer = FileUtils.openFileWriter(string + ".lattice");
SyntaxTreeIterator iterator = new SyntaxTreeIterator(string, 1, 2, 4,
6, 8, 10, false);
int pos_candidates = 0;
int pos_correct = 0;
int correct = 0;
int total = 0;
int candidates = 0;
int covered = 0;
Counter<String> candidate_counter = new Counter<String>();
Counter<String> total_counter = new Counter<String>();
while (iterator.hasNext()) {
SyntaxTree tree = iterator.next();
for (Node node : tree.getNodes()) {
String pos = node.getPos();
// if (!pos.equals("NN")) {
// continue;
// }
Set<SttsTag> pos_set = pos_dict.get(pos);
if (check(node.getFeats(), pos_set)) {
pos_correct += 1;
}
pos_candidates += pos_set.size();
if (pos_set.size() > 1) {
Set<SttsTag> form_set = dict.get(normalize(node.getForm(),
pos));
if (form_set != null) {
covered++;
Set<SttsTag> set = mergeSets(pos_set, form_set);
if (!set.isEmpty()) {
pos_set = set;
}
}
}
if (check(node.getFeats(), pos_set)) {
correct += 1;
}
candidates += pos_set.size();
total += 1;
// candidate_counter.increment(node.getPos(), (double
// )pos_set.size());
// total_counter.increment(node.getPos(), (double ) 1);
writer.write(node.getForm());
if (pos_set.size() < 5) {
for (SttsTag tag : pos_set) {
writer.write(' ');
writer.write(tag.feat_string_);
}
} else {
writer.write(' ');
writer.write('*');
}
writer.write('\n');
}
writer.write('\n');
}
System.err.println("pos correct: " + pos_correct * 100. / total);
System.err
.println("pos candidates: " + pos_candidates / (double) total);
System.err.println("correct: " + correct * 100. / total);
System.err.println("candidates: " + candidates / (double) total);
System.err.println("coverage: " + covered / (double) total);
for (Map.Entry<String, Double> entry : candidate_counter.entrySet()) {
System.err.println(entry.getKey() + ":" + entry.getValue()
/ total_counter.count(entry.getKey()));
}
writer.close();
}
private String normalize(String form, String pos) {
if (pos.equals("NE")) {
return form;
}
StringBuilder sb = new StringBuilder(form.toLowerCase());
if (pos.equals("NN")) {
sb.setCharAt(0, Character.toUpperCase(sb.charAt(0)));
}
return sb.toString();
}
boolean check(String tag_string, Set<SttsTag> set) {
for (SttsTag tag : set) {
if (tag.feat_string_.equals(tag_string)) {
return true;
}
}
return false;
}
private static Set<SttsTag> mergeSets(Set<SttsTag> pos_set,
Set<SttsTag> form_set) {
Set<SttsTag> set = new HashSet<SttsTag>();
for (SttsTag tag : pos_set) {
for (SttsTag form_tag : form_set) {
if (match(tag, form_tag)) {
set.add(tag);
break;
}
}
}
return set;
}
static boolean match(SttsTag tag, SttsTag form_tag) {
Matcher m = new Matcher();
m.add(tag.case_.toString(), form_tag.case_.toString());
m.add(tag.gender_.toString(), form_tag.gender_.toString());
m.add(tag.number_.toString(), form_tag.number_.toString());
m.add(tag.person_.toString(), form_tag.person_.toString());
m.add(tag.tense_.toString(), form_tag.tense_.toString());
m.add(tag.mood_.toString(), form_tag.mood_.toString());
m.add(tag.degree_.toString(), form_tag.degree_.toString());
return m.matches();
}
private Map<String, Set<SttsTag>> readPosFile(String string) {
Map<String, Set<SttsTag>> map = new HashMap<String, Set<SttsTag>>();
SyntaxTreeIterator iterator = new SyntaxTreeIterator(string, 1, 2, 4,
6, 8, 10, false);
while (iterator.hasNext()) {
SyntaxTree tree = iterator.next();
for (Node node : tree.getNodes()) {
Set<SttsTag> set = map.get(node.getPos());
if (set == null) {
set = new HashSet<SttsTag>();
map.put(node.getPos(), set);
}
set.add(parseSeekerTag(node.getFeats()));
}
}
return map;
}
private SttsTag parseSeekerTag(String feats) {
SttsTag tag = new SttsTag();
// setPos(tag, pos);
String[] features = feats.split("\\|");
for (String feature : features) {
if (!feature.isEmpty()) {
setSeekerFeature(tag, feature);
}
}
tag.setFeatString(feats);
return tag;
}
private void setSeekerFeature(SttsTag tag, String feature) {
if (feature.equals("_"))
return;
String[] key_value = feature.toLowerCase().split("=");
String key = key_value[0];
String value = key_value[1];
switch (key) {
case "case":
if (value.equals("*")) {
tag.case_ = SttsTag.Case.amb;
} else {
tag.case_ = SttsTag.Case.valueOf(value);
}
break;
case "number":
if (value.equals("*")) {
tag.number_ = SttsTag.Number.amb;
} else {
tag.number_ = SttsTag.Number.valueOf(value);
}
break;
case "gender":
if (value.equals("*")) {
tag.gender_ = SttsTag.Gender.amb;
} else {
tag.gender_ = SttsTag.Gender.valueOf(value);
}
break;
case "person":
switch (value) {
case "1":
tag.person_ = SttsTag.Person.fst;
break;
case "2":
tag.person_ = SttsTag.Person.snd;
break;
case "3":
tag.person_ = SttsTag.Person.thd;
break;
}
break;
case "tense":
tag.tense_ = SttsTag.Tense.valueOf(value);
break;
case "mood":
tag.mood_ = SttsTag.Mood.valueOf(value);
break;
case "degree":
if (value.equals("*")) {
tag.degree_ = SttsTag.Degree.amb;
} else {
tag.degree_ = SttsTag.Degree.valueOf(value);
}
break;
default:
throw new RuntimeException("Unknown key: " + key);
}
}
public Map<String, Set<SttsTag>> readFile(String filename) {
Map<String, Set<SttsTag>> dict = new HashMap<String, Set<SttsTag>>();
LineIterator iterator = new LineIterator(filename);
while (iterator.hasNext()) {
List<String> line = iterator.next();
if (!line.isEmpty()) {
String form = line.get(0);
Set<SttsTag> readings = dict.get(form);
if (readings == null) {
readings = new HashSet<SttsTag>();
dict.put(form, readings);
}
SttsTag tag = parseMorphTagString(line.get(2), line.get(3));
readings.add(tag);
}
}
return dict;
}
public SttsTag parseMorphTagString(String pos, String morph_tag_string) {
SttsTag tag = new SttsTag();
// setPos(tag, pos);
String[] features = morph_tag_string.split("[<>]");
for (String feature : features) {
if (!feature.isEmpty()) {
setFeature(tag, feature);
}
}
return tag;
}
// private void setPos(SttsTag tag, String pos) {
// assert pos.startsWith("<+");
// assert pos.endsWith(">");
//
// tag.pos_ = SttsTag.Pos.valueOf(pos.substring(2, pos.length() - 1));
// }
public static void setFeature(SttsTag tag, String feature) {
feature = feature.toLowerCase();
switch (feature) {
// SubPoS
// case "adv":
// tag.type_ = SttsTag.Type.ADJA;
// break;
//
// case "pred":
// if (tag.pos_ == SttsTag.Pos.ADJ) {
// tag.type_ = SttsTag.Type.ADJA;
// }
// break;
//
// case "comma":
// tag.type_ = SttsTag.Type.Comma;
// break;
//
// case "norm":
// tag.type_ = SttsTag.Type.Period;
// break;
//
// case "left":
// case "right":
// tag.type_ = SttsTag.Type.Bracket;
// break;
//
// case "coord":
// tag.type_ = SttsTag.Type.KON;
// break;
//
// case "compar":
// tag.type_ = SttsTag.Type.KOKOM;
// break;
//
// case "sub":
// tag.type_ = SttsTag.Type.KOUS; // KOUI
// break;
//
// case "ppres":
// tag.type_ = SttsTag.Type.ADJD;
// tag.pos_ = SttsTag.Pos.ADJ;
// break;
//
// case "imp":
// tag.type_ = SttsTag.Type.VVIMP;
// break;
//
// case "inf":
// tag.type_ = Type.VVINF;
// break;
//
// case "zu":
// tag.type_ = Type.VVIZU;
// break;
//
// case "ppast":
// tag.type_ = Type.VVPP;
// break;
//
// case "ans":
// tag.type_ = Type.PTKANT;
// break;
// Case
case "acc":
tag.case_ = SttsTag.Case.acc;
break;
case "dat":
tag.case_ = SttsTag.Case.dat;
break;
case "gen":
tag.case_ = SttsTag.Case.gen;
break;
case "nom":
tag.case_ = SttsTag.Case.nom;
break;
// Gender
case "fem":
tag.gender_ = SttsTag.Gender.fem;
break;
case "masc":
tag.gender_ = SttsTag.Gender.masc;
break;
case "neut":
tag.gender_ = SttsTag.Gender.neut;
break;
// Number
case "sg":
tag.number_ = SttsTag.Number.sg;
break;
case "pl":
tag.number_ = SttsTag.Number.pl;
break;
// Degree
case "pos":
tag.degree_ = SttsTag.Degree.pos;
break;
case "comp":
tag.degree_ = SttsTag.Degree.comp;
break;
case "sup":
tag.degree_ = SttsTag.Degree.sup;
break;
// Person
case "1":
tag.person_ = SttsTag.Person.fst;
break;
case "2":
tag.person_ = SttsTag.Person.snd;
break;
case "3":
tag.person_ = SttsTag.Person.thd;
break;
// Tense
case "pres":
tag.tense_ = SttsTag.Tense.pres;
break;
case "past":
tag.tense_ = SttsTag.Tense.past;
break;
// Mood
case "ind":
tag.mood_ = SttsTag.Mood.ind;
break;
case "subj":
tag.mood_ = SttsTag.Mood.subj;
break;
// case "attr":
//
// switch (tag.pos_) {
//
// case DEM:
// tag.type_ = Type.PDAT;
// break;
// case INDEF:
// tag.type_ = Type.PIAT; // PIDAT
// break;
// case POSS:
// tag.type_ = Type.PPOSAT;
// break;
// case REL:
// tag.type_ = Type.PRELAT;
// break;
// case WPRO:
// tag.type_ = Type.PWAT;
// break;
//
// default:
// System.err.println("attr" + tag.pos_);
// break;
// }
//
//
// break;
// case "subst":
//
// switch (tag.pos_) {
//
// case DEM:
// tag.type_ = Type.PDS;
// break;
// case INDEF:
// tag.type_ = Type.PIS;
// break;
// case POSS:
// tag.type_ = Type.PPOSS;
// break;
// case REL:
// tag.type_ = Type.PRELS;
// break;
// case WPRO:
// tag.type_ = Type.PWS;
// break;
//
// default:
// System.err.println("subst " + tag.pos_);
// break;
// }
//
//
// break;
case "_":
case "pro":
case "nogend":
case "wk": // weak
case "st": // strong
case "old": // old dative
case "invar":
case "simp":
case "adj":
case "def":
case "pers":
case "refl":
case "indef":
case "rec":
case "neg":
case "comma":
case "norm":
case "left":
case "right":
case "adv":
case "pred":
case "inf":
case "ppast":
case "coord":
case "ppres":
case "imp":
case "zu":
case "compar":
case "sub":
case "ans":
case "attr":
case "subst":
break;
default:
throw new RuntimeException("Unknown feature: " + feature);
}
}
}