// Copyright 2013 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.morph.mapper.czech;
import java.io.IOException;
import java.io.Writer;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import marmot.morph.mapper.czech.MsdTag.Pos;
import marmot.util.Counter;
import marmot.util.FileUtils;
import marmot.util.LineIterator;
// Based on http://nl.ijs.si/ME/V4/msd/html/msd-cs.html
/**
 * Parses positional morphosyntactic description (MSD) strings into
 * {@link MsdTag} objects.
 *
 * <p>
 * A tag is read character by character: the first character after the
 * leading {@code '#'} selects the part of speech, and every following
 * character is interpreted according to its position. A {@code '-'} (or
 * {@code '_'}) at a position means "not specified" and is skipped. Positions
 * that a part-of-speech specific parser does not know cause a
 * {@link RuntimeException}.
 */
public class MsdReader {

    /**
     * Word budget of {@link #main(String[])}: once more than this many words
     * have been written, conversion stops at the next empty input line.
     */
    private static final int MAX_WORDS = 50000;

    /**
     * Builds a dictionary from the second column of a file to the tags seen
     * with it.
     *
     * <p>
     * Every non-empty line must have at least three columns: the second one
     * (index 1) is used as the dictionary key and the third one (index 2) is
     * parsed as an MSD tag. The counter of a key is incremented by one for the
     * {@code toHumanString()} form of each parsed tag.
     *
     * @param filename
     *            path handed to {@link LineIterator}
     * @return map from key to a counter over human-readable tag strings
     * @throws IndexOutOfBoundsException
     *             if a non-empty line has fewer than three columns
     * @throws RuntimeException
     *             if a tag cannot be parsed, see {@link #parse(String)}
     */
    public static Map<String, Counter<String>> getDict(String filename) {
        Map<String, Counter<String>> dict = new HashMap<String, Counter<String>>();
        LineIterator iterator = new LineIterator(filename);
        MsdReader reader = new MsdReader();

        while (iterator.hasNext()) {
            List<String> line = iterator.next();
            if (line.isEmpty()) {
                continue;
            }

            MsdTag tag = reader.parse(line.get(2));
            String key = line.get(1);

            Counter<String> counter = dict.get(key);
            if (counter == null) {
                counter = new Counter<String>();
                dict.put(key, counter);
            }
            counter.increment(tag.toHumanString(), 1.0);
        }
        return dict;
    }

    /**
     * Converts a three-column input file into an eight-column, tab-separated
     * output file.
     *
     * <p>
     * For every non-empty input line one output line is written with the
     * columns: running token index (reset to 0 after each empty line), input
     * column 0, input column 1, {@code toPosString()} (twice),
     * {@code toHumanMorphString()}, the literal {@code 0} and the literal
     * {@code _}. Empty input lines are copied as empty output lines. After
     * more than {@link #MAX_WORDS} words have been written, the conversion
     * stops at the next empty input line.
     *
     * @param args
     *            {@code args[0]} is the input file, {@code args[1]} the output
     *            file
     * @throws IOException
     *             if writing or closing the output fails
     * @throws IllegalArgumentException
     *             if fewer than two arguments are given
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: MsdReader <input-file> <output-file>");
        }

        LineIterator iterator = new LineIterator(args[0]);
        Writer writer = FileUtils.openFileWriter(args[1]);
        // try/finally so the writer is released even when a tag fails to
        // parse or a write throws half-way through the file.
        try {
            MsdReader reader = new MsdReader();
            int num_words = 0;
            int token_index = 0;

            while (iterator.hasNext()) {
                List<String> line = iterator.next();

                if (line.isEmpty()) {
                    // An empty line restarts the token numbering.
                    token_index = 0;
                    writer.write('\n');
                    if (num_words > MAX_WORDS) {
                        break;
                    }
                    continue;
                }

                // Parse before writing anything so a bad tag does not leave
                // a partially written line behind.
                MsdTag tag = reader.parse(line.get(2));

                writer.write(Integer.toString(token_index));
                writer.write('\t');
                writer.write(line.get(0));
                writer.write('\t');
                writer.write(line.get(1));
                writer.write('\t');
                writer.write(tag.toPosString());
                writer.write('\t');
                writer.write(tag.toPosString());
                writer.write('\t');
                writer.write(tag.toHumanMorphString());
                writer.write('\t');
                writer.write('0');
                writer.write('\t');
                writer.write('_');
                writer.write('\n');

                token_index++;
                num_words++;
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Parses a single MSD string.
     *
     * <p>
     * The tag is normalized first: {@code '-'} becomes {@code '_'} and all
     * letters are lowercased. A tag that does not start with {@code '#'} is
     * mapped to {@link Pos#z} without further inspection (with assertions
     * enabled it must be exactly {@code "c"}). Otherwise the character after
     * {@code '#'} selects the part of speech and the remaining characters are
     * handed to the matching positional parser.
     *
     * @param msd_tag
     *            raw tag string, must not be {@code null}
     * @return the populated tag
     * @throws IllegalArgumentException
     *             if nothing follows the leading {@code '#'}, or if an
     *             attribute value is not accepted by the corresponding
     *             {@code valueOf} lookup
     * @throws RuntimeException
     *             if the part of speech is unknown or a position is not
     *             expected for that part of speech
     */
    public MsdTag parse(String msd_tag) {
        // Locale.ROOT keeps the case mapping independent of the JVM default
        // locale; with a Turkish default, 'I' would otherwise lowercase to a
        // dotless i and miss the 'i' branch below.
        msd_tag = msd_tag.replace('-', '_').toLowerCase(Locale.ROOT);
        MsdTag tag = new MsdTag();

        if (!msd_tag.startsWith("#")) {
            assert msd_tag.equals("c") : "Unexpected tag without '#': " + msd_tag;
            tag.pos_ = Pos.z;
            return tag;
        }

        msd_tag = msd_tag.substring(1);
        if (msd_tag.isEmpty()) {
            throw new IllegalArgumentException("Empty MSD tag after '#'");
        }

        char pos_char = msd_tag.charAt(0);
        switch (pos_char) {
        case 'v':
            parseVerb(tag, msd_tag);
            break;
        case 'a':
            parseAdj(tag, msd_tag);
            break;
        case 'c':
            parseConjunction(tag, msd_tag);
            break;
        case 'n':
            parseNoun(tag, msd_tag);
            break;
        case 'm':
            parseNumeral(tag, msd_tag);
            break;
        case 's':
            parseAdposition(tag, msd_tag);
            break;
        case 'r':
            parseAdverb(tag, msd_tag);
            break;
        case 'p':
            parsePronoun(tag, msd_tag);
            break;
        case 'q':
            tag.pos_ = Pos.q;
            parseGeneric(tag, msd_tag);
            break;
        case 'x':
            tag.pos_ = Pos.x;
            parseGeneric(tag, msd_tag);
            break;
        case 'y':
            tag.pos_ = Pos.y;
            parseGeneric(tag, msd_tag);
            break;
        case 'i':
            tag.pos_ = Pos.i;
            parseGeneric(tag, msd_tag);
            break;
        default:
            throw new RuntimeException("Unknown POS: " + pos_char);
        }
        return tag;
    }

    /** Creates the exception thrown for a position a parser does not know. */
    private static RuntimeException unexpectedIndex(int index) {
        return new RuntimeException("Unexpected Index: " + index);
    }

    /**
     * Pronoun: positions 1-5 are type, person, gender, number and case;
     * positions 6-10, 12 and 13 are accepted but not stored.
     */
    private void parsePronoun(MsdTag tag, String msd_tag) {
        tag.pos_ = Pos.p;
        for (int index = 1; index < msd_tag.length(); index++) {
            char c = msd_tag.charAt(index);
            if (c == '_') {
                continue;
            }
            switch (index) {
            case 1:
                setType(tag, c);
                break;
            case 2:
                setPerson(tag, c);
                break;
            case 3:
                setGender(tag, c);
                break;
            case 4:
                setNumber(tag, c);
                break;
            case 5:
                setCase(tag, c);
                break;
            case 6: // ignore owner number
            case 7: // ignore owner gender
            case 8: // ignore clitic
            case 9: // ignore referent type
            case 10: // ignore syntactic type
            case 12: // animate, not mapped
            case 13: // ignore clitic
                break;
            default:
                throw unexpectedIndex(index);
            }
        }
    }

    /**
     * Parts of speech that only carry a type at position 1. The caller sets
     * {@code tag.pos_} before calling this method.
     */
    private void parseGeneric(MsdTag tag, String msd_tag) {
        for (int index = 1; index < msd_tag.length(); index++) {
            char c = msd_tag.charAt(index);
            if (c == '_') {
                continue;
            }
            if (index != 1) {
                throw unexpectedIndex(index);
            }
            setType(tag, c);
        }
    }

    /**
     * Adverb: position 1 is the type. Position 2 (degree) is accepted, but
     * {@link #setDegree(MsdTag, char)} deliberately does not store it for
     * adverbs.
     */
    private void parseAdverb(MsdTag tag, String msd_tag) {
        tag.pos_ = Pos.r;
        for (int index = 1; index < msd_tag.length(); index++) {
            char c = msd_tag.charAt(index);
            if (c == '_') {
                continue;
            }
            switch (index) {
            case 1:
                setType(tag, c);
                break;
            case 2:
                setDegree(tag, c);
                break;
            default:
                throw unexpectedIndex(index);
            }
        }
    }

    /**
     * Adposition: position 1 is the type, position 2 (formation) is ignored,
     * position 3 is the case.
     */
    private void parseAdposition(MsdTag tag, String msd_tag) {
        tag.pos_ = Pos.s;
        for (int index = 1; index < msd_tag.length(); index++) {
            char c = msd_tag.charAt(index);
            if (c == '_') {
                continue;
            }
            switch (index) {
            case 1:
                setType(tag, c);
                break;
            case 2: // ignore formation
                break;
            case 3:
                setCase(tag, c);
                break;
            default:
                throw unexpectedIndex(index);
            }
        }
    }

    /**
     * Numeral: positions 1-4 are type, gender, number and case; positions 5,
     * 8 and 9 are accepted but not stored.
     */
    private void parseNumeral(MsdTag tag, String msd_tag) {
        tag.pos_ = Pos.m;
        for (int index = 1; index < msd_tag.length(); index++) {
            char c = msd_tag.charAt(index);
            if (c == '_') {
                continue;
            }
            switch (index) {
            case 1:
                setType(tag, c);
                break;
            case 2:
                setGender(tag, c);
                break;
            case 3:
                setNumber(tag, c);
                break;
            case 4:
                setCase(tag, c);
                break;
            case 5: // ignore form
            case 8: // ignore class
            case 9: // animate, not mapped
                break;
            default:
                throw unexpectedIndex(index);
            }
        }
    }

    /**
     * Noun: positions 1-4 are type, gender, number and case; position 7 is
     * accepted but not stored.
     */
    private void parseNoun(MsdTag tag, String msd_tag) {
        tag.pos_ = Pos.n;
        for (int index = 1; index < msd_tag.length(); index++) {
            char c = msd_tag.charAt(index);
            if (c == '_') {
                continue;
            }
            switch (index) {
            case 1:
                setType(tag, c);
                break;
            case 2:
                setGender(tag, c);
                break;
            case 3:
                setNumber(tag, c);
                break;
            case 4:
                setCase(tag, c);
                break;
            case 7: // animate, not mapped
                break;
            default:
                throw unexpectedIndex(index);
            }
        }
    }

    /**
     * Conjunction: position 1 is the type, position 6 the number and
     * position 7 the person.
     */
    private void parseConjunction(MsdTag tag, String msd_tag) {
        tag.pos_ = Pos.c;
        for (int index = 1; index < msd_tag.length(); index++) {
            char c = msd_tag.charAt(index);
            if (c == '_') {
                continue;
            }
            switch (index) {
            case 1:
                setType(tag, c);
                break;
            case 6:
                setNumber(tag, c);
                break;
            case 7:
                setPerson(tag, c);
                break;
            default:
                throw unexpectedIndex(index);
            }
        }
    }

    /**
     * Adjective: positions 1-5 are type, degree, gender, number and case;
     * positions 8 and 9 are accepted but not stored.
     */
    private void parseAdj(MsdTag tag, String msd_tag) {
        tag.pos_ = Pos.a;
        for (int index = 1; index < msd_tag.length(); index++) {
            char c = msd_tag.charAt(index);
            if (c == '_') {
                continue;
            }
            switch (index) {
            case 1:
                setType(tag, c);
                break;
            case 2:
                setDegree(tag, c);
                break;
            case 3:
                setGender(tag, c);
                break;
            case 4:
                setNumber(tag, c);
                break;
            case 5:
                setCase(tag, c);
                break;
            case 8: // animate, not mapped
            case 9: // formation, not mapped
                break;
            default:
                throw unexpectedIndex(index);
            }
        }
    }

    /**
     * Verb: positions 1-7 are type, mood, tense, person, number, gender and
     * voice; positions 8, 12 and 13 are accepted but not stored.
     */
    private void parseVerb(MsdTag tag, String msd_tag) {
        tag.pos_ = Pos.v;
        for (int index = 1; index < msd_tag.length(); index++) {
            char c = msd_tag.charAt(index);
            if (c == '_') {
                continue;
            }
            switch (index) {
            case 1:
                setType(tag, c);
                break;
            case 2:
                setMood(tag, c);
                break;
            case 3:
                setTense(tag, c);
                break;
            case 4:
                setPerson(tag, c);
                break;
            case 5:
                setNumber(tag, c);
                break;
            case 6:
                setGender(tag, c);
                break;
            case 7:
                setVoice(tag, c);
                break;
            case 8: // negative, not mapped
            case 12: // animate, not mapped
            case 13: // ignore clitics
                break;
            default:
                throw unexpectedIndex(index);
            }
        }
    }

    // The setters below look up the constant whose name is the single
    // character c; an unknown character makes valueOf fail.

    private void setType(MsdTag tag, char c) {
        tag.type_ = MsdTag.Type.valueOf(Character.toString(c));
    }

    private void setCase(MsdTag tag, char c) {
        tag.case_ = MsdTag.Case.valueOf(Character.toString(c));
    }

    /** Stores the degree, except for adverbs ({@link Pos#r}) where it is dropped. */
    private void setDegree(MsdTag tag, char c) {
        if (tag.pos_ == Pos.r) {
            return;
        }
        tag.degree_ = MsdTag.Degree.valueOf(Character.toString(c));
    }

    private void setVoice(MsdTag tag, char c) {
        tag.voice_ = MsdTag.Voice.valueOf(Character.toString(c));
    }

    private void setGender(MsdTag tag, char c) {
        tag.gender_ = MsdTag.Gender.valueOf(Character.toString(c));
    }

    private void setMood(MsdTag tag, char c) {
        tag.mood_ = MsdTag.Mood.valueOf(Character.toString(c));
    }

    private void setNumber(MsdTag tag, char c) {
        tag.number_ = MsdTag.Number.valueOf(Character.toString(c));
    }

    private void setTense(MsdTag tag, char c) {
        tag.tense_ = MsdTag.Tense.valueOf(Character.toString(c));
    }

    /**
     * Maps the digits 1-3 to the corresponding person constant.
     *
     * @throws RuntimeException
     *             for any other character
     */
    private void setPerson(MsdTag tag, char c) {
        switch (c) {
        case '1':
            tag.person_ = MsdTag.Person.fst;
            break;
        case '2':
            tag.person_ = MsdTag.Person.snd;
            break;
        case '3':
            tag.person_ = MsdTag.Person.thd;
            break;
        case '_':
            // The parsers in this class skip '_' before calling, so this
            // branch only acts as a safeguard.
            tag.person_ = MsdTag.Person._;
            break;
        default:
            throw new RuntimeException("Unknown value: " + c);
        }
    }
}