// Copyright 2013 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.morph.mapper.spanish;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import marmot.morph.mapper.MorphTag;
import marmot.morph.mapper.Names;
import marmot.morph.mapper.Node;
// Based on http://nlp.lsi.upc.edu/freeling/doc/tagsets/tagset-es.html
// Modifications:
// based on the data (CoNLL 2009 dataset, IULA treebank) all numbers don't get types
// No adjective degree and noun degree in CoNLL 2009
// Weird type "numeral" in CoNLL:
// - dos, tres, ... seem to be tagged as z in IULA and as p,d in CoNLL
// - other words like "ambos" are of type indefinite in IULA and of type numeral in CoNLL
// - current treatment: move numeral to indefinite
// - move certain forms to z (un, uno, dos, tres, ...)
// IULA doesn't use fun=p; it uses verbs with mood=p instead:
// - CoNLL a type=q|num=.|gen=.|fun=p -> v type=m|num=.|gen=.|mood=p
// IULA annotates all prepositions as SPS00
// In CoNLL proper nouns have common gender and invariable number.
// In CoNLL verbs might have common gender and invariable number
// In CoNLL and IULA "se" is not annotated as PP3CN000
/**
 * Morphological tag holding one value for each of fifteen features (part of
 * speech, type, degree, gender, ...). Every feature has an "undefined" value
 * named {@code _}; {@link #reset()} puts all features into that state.
 *
 * <p>Instances are mutable and the feature fields are package-visible so that
 * other classes in this package can fill them in directly.
 *
 * <p>NOTE(review): the enum constant name {@code _} is only accepted by
 * compilers targeting Java 8 or older; it is kept because code outside this
 * class may refer to it.
 */
public class EaglesTag implements MorphTag {

    Pos pos_;
    Type type_;
    Degree degree_;
    Gender gender_;
    Number number_;
    Function function_;
    Mood mood_;
    Tense tense_;
    Person person_;
    Case case_;
    OwnerNumber owner_;
    Politeness politeness_;
    Form form_;
    Closing closing_;
    NounDegree noun_degree_;

    enum Pos {
        a, // Adjective
        c, // Conjunction
        d, // Determiner
        f, // Punctuation
        i, // Interjection
        n, // Noun
        p, // Pronoun
        r, // Adverbs
        s, // Preposition
        v, // Verb
        w, // Date
        z, // Numeral
        _, // Undef
    }

    enum Type {
        a, // article, auxiliary, exclamation mark
        c, // common, coordinating, comma
        d, // demonstrative, colon
        e, // exclamative, quotation
        g, // general, hyphen
        h, // slash
        i, // indefinite, question mark
        t, // interrogative, percentage
        m, // main, principal, currency
        n, // negative
        o, // ordinal
        p, // possessive (determiner), proper, personal, preposition, period,
           // bracket
        q, // qualificative
        r, // relative, «/»
        s, // semiauxiliary, subordinating, etc
        x, // possessive (pronoun), semicolon
        z, // mathsign
        _, // undef
    }

    enum Degree {
        c, // diminutive
        s, // superlative
        _, // undef
    }

    enum Gender {
        m, // masculine
        f, // feminine
        n, // neuter
        c, // common
        _, // undef
    }

    enum Number {
        s, // singular
        p, // plural
        n, // invariable
        _, // undef
    }

    enum Function {
        p, // participle
        _, // undef
    }

    enum Mood {
        i, // Indicativo
        s, // Subjuntivo
        m, // Imperativo
        n, // Infinitivo
        g, // Gerundio
        p, // Participio
        _, // Undef
    }

    enum Tense {
        p, // Presente
        i, // Imperfecto
        f, // Futuro
        s, // Pasado
        c, // Condicional
        _, // Undef
    }

    enum Person {
        first, second, third, _
    }

    enum Case {
        n, // nominative
        a, // accusative
        d, // dative
        o, // oblicuo
        _, // undef
    }

    enum OwnerNumber {
        s, // Singular
        p, // Plural
        _, // undef
    }

    enum Politeness {
        p, // Polite
        _, // Undef
    }

    enum Form {
        s, // simple
        c, // contracted
        _, // undef
    }

    enum Closing {
        a, // opening
        t, // closing
        _
    }

    enum NounDegree {
        d, // diminutive
        a, // augmentative
        _, // undef
    }

    /**
     * Word forms that {@link #normalize(Node, boolean)} re-tags as numerals
     * ({@code Pos.z}) when they arrive tagged as determiner or pronoun.
     * Read-only after class initialization.
     */
    private static final Set<String> NUMBERS = Collections
            .unmodifiableSet(new HashSet<String>(Arrays.asList(
                    "catorce",
                    "cero",
                    "cien",
                    "cien_mil",
                    "cien_por_cien",
                    "cien_por_ciento",
                    "ciento_ochenta",
                    "ciento_sesenta_mil_millones",
                    "ciento_setenta_y_ocho",
                    "cinco",
                    "cinco_mil",
                    "cinco_mil_millones",
                    "cinco_por_ciento",
                    "cincuenta",
                    "cincuenta_por_ciento",
                    "cincuenta_y_dos",
                    "cincuenta_y_uno",
                    "cuarenta",
                    "cuarenta_y_cinco",
                    "cuarenta_y_dos",
                    "cuarenta_y_ocho",
                    "cuatro",
                    "cuatro_de_cada_diez",
                    "cuatro_millones",
                    "cuatro_mil_millones",
                    "cuatro_por_ciento",
                    "cuatro_por_mil",
                    "diecinueve",
                    "dieciocho",
                    "dieciséis",
                    "diecisiete",
                    "diez",
                    "diez_mil",
                    "diez_millones",
                    "diez_por_ciento",
                    "doce",
                    "dos",
                    "doscientas_cincuenta",
                    "dos_millones",
                    "dos_mil_millones",
                    "dos_mil_quinientas",
                    "dos_por_ciento",
                    "dos_por_mil",
                    "media_docena",
                    "mil",
                    "mil_millones",
                    "mil_seiscientas",
                    "noventa",
                    "noventa_por_ciento",
                    "nueve",
                    "nueve_de_cada_diez",
                    "nueve_mil",
                    "ochenta",
                    "ocho",
                    "ocho_de_cada_diez",
                    "ocho_por_ciento",
                    "once",
                    "quince",
                    "quince_por_ciento",
                    "quinientas",
                    "quinientos_mil",
                    "quinientos_un",
                    "seis",
                    "seis_millones",
                    "seis_por_ciento",
                    "sesenta",
                    "sesenta_y_cinco",
                    "sesenta_y_nueve",
                    "sesenta_y_ocho",
                    "sesenta_y_seis",
                    "sesenta_y_siete",
                    "sesenta_y_un",
                    "setenta",
                    "siete",
                    "siete_mil",
                    "siete_por_ciento",
                    "tanto_por_ciento",
                    "treinta",
                    "treinta_mil_millones",
                    "tres",
                    "trescientas_sesenta_y_cinco",
                    "trescientos_doce",
                    "tres_de_cada_cuatro",
                    "tres_mil",
                    "tres_millones",
                    "tres_mil_millones",
                    "tres_por_ciento",
                    "un",
                    "una",
                    "una_de_cada_cuatro",
                    "una_docena",
                    "un_centenar",
                    "un_millar",
                    "un_millón",
                    "uno",
                    "uno_de_cada_diez",
                    "uno_por_ciento",
                    "veinte",
                    "veinte_mil",
                    "veinte_mil_millones",
                    "veinte_por_ciento",
                    "veinticinco",
                    "veinticinco_mil",
                    "veinticinco_mil_millones",
                    "veinticuatro",
                    "veintitrés",
                    "veintiuno")));

    /** Creates a tag with every feature undefined. */
    public EaglesTag() {
        reset();
    }

    /** Sets every feature, including the part of speech, to its undefined value. */
    public void reset() {
        pos_ = Pos._;
        type_ = Type._;
        degree_ = Degree._;
        gender_ = Gender._;
        number_ = Number._;
        function_ = Function._;
        mood_ = Mood._;
        tense_ = Tense._;
        person_ = Person._;
        case_ = Case._;
        owner_ = OwnerNumber._;
        politeness_ = Politeness._;
        form_ = Form._;
        closing_ = Closing._;
        noun_degree_ = NounDegree._;
    }

    /**
     * Returns the names of all fifteen feature values concatenated without a
     * separator, undefined values included.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(pos_);
        sb.append(type_);
        sb.append(degree_);
        sb.append(gender_);
        sb.append(number_);
        sb.append(function_);
        sb.append(mood_);
        sb.append(tense_);
        sb.append(person_);
        sb.append(case_);
        sb.append(owner_);
        sb.append(politeness_);
        sb.append(form_);
        sb.append(closing_);
        sb.append(noun_degree_);
        return sb.toString();
    }

    /**
     * Renders every defined feature except the part of speech as
     * {@code name=value}, joined by {@code '|'}.
     *
     * @return the feature string, or {@code "_"} when no feature is defined
     */
    public String toHumanMorphString() {
        StringBuilder sb = new StringBuilder();
        // The order of these calls fixes the order of features in the output.
        addFeature(sb, Names.Type, type_ == Type._, type_.toString());
        addFeature(sb, Names.Degree, degree_ == Degree._, degree_.toString());
        addFeature(sb, Names.Number, number_ == Number._, number_.toString());
        addFeature(sb, Names.Gender, gender_ == Gender._, gender_.toString());
        addFeature(sb, Names.Function, function_ == Function._,
                function_.toString());
        addFeature(sb, Names.Mood, mood_ == Mood._, mood_.toString());
        addFeature(sb, Names.Tense, tense_ == Tense._, tense_.toString());
        addFeature(sb, Names.Person, person_ == Person._, person_.toString());
        addFeature(sb, Names.Case, case_ == Case._, case_.toString());
        addFeature(sb, Names.OwnerNumber, owner_ == OwnerNumber._,
                owner_.toString());
        addFeature(sb, Names.Politeness, politeness_ == Politeness._,
                politeness_.toString());
        addFeature(sb, Names.Form, form_ == Form._, form_.toString());
        addFeature(sb, Names.Closing, closing_ == Closing._,
                closing_.toString());
        addFeature(sb, Names.NounDegree, noun_degree_ == NounDegree._,
                noun_degree_.toString());
        if (sb.length() == 0) {
            return "_";
        }
        return sb.toString();
    }

    /**
     * Returns the part of speech, a {@code '|'}, and the result of
     * {@link #toHumanMorphString()}.
     */
    public String toHumanString() {
        StringBuilder sb = new StringBuilder();
        addFeature(sb, "", false, pos_.toString());
        sb.append("|");
        sb.append(toHumanMorphString());
        return sb.toString();
    }

    /**
     * Appends one feature to {@code sb} unless it is undefined.
     *
     * @param sb        buffer to append to; a {@code '|'} is written first when
     *                  it already has content
     * @param name      feature name, written lower-cased followed by
     *                  {@code '='}; an empty name writes the bare value
     * @param undefined when {@code true} nothing is appended
     * @param value     feature value; {@code first}/{@code second}/{@code third}
     *                  are written as {@code 1}/{@code 2}/{@code 3}
     */
    private static void addFeature(StringBuilder sb, String name,
            boolean undefined, String value) {
        if (undefined) {
            return;
        }
        switch (value) {
        case "first":
            value = "1";
            break;
        case "second":
            value = "2";
            break;
        case "third":
            value = "3";
            break;
        default:
            break;
        }
        if (sb.length() > 0) {
            sb.append('|');
        }
        if (name.length() > 0) {
            // Locale.ROOT keeps the key identical whatever the JVM default
            // locale is (e.g. no dotless i under a Turkish locale).
            sb.append(name.toLowerCase(Locale.ROOT));
            sb.append('=');
        }
        sb.append(value);
    }

    /**
     * Rewrites this tag in place according to a fixed set of rules:
     * <ul>
     * <li>degree and noun degree are always cleared;</li>
     * <li>the form {@code "se"} becomes a third-person relative pronoun with
     * common gender and invariable number;</li>
     * <li>adjectives with participle function become main verbs with
     * participle mood;</li>
     * <li>determiners and pronouns whose form is a known number word become
     * bare numerals;</li>
     * <li>prepositions are reduced to type {@code p}, form {@code s};</li>
     * <li>proper nouns lose every feature except part of speech and type;</li>
     * <li>verbs lose common gender and invariable number; lemma {@code ser}
     * gets type {@code s}; lemma {@code estar} gets type {@code a} when a
     * gerund verb is found next to it and type {@code m} otherwise;</li>
     * <li>negative adverbs keep only part of speech and type, all other
     * adverbs get type {@code g}.</li>
     * </ul>
     * A {@code null} form or lemma matches none of the special cases.
     *
     * @param node the token this tag belongs to; its form, lemma, head and
     *             children are read
     * @param iula selects where a gerund is looked for when the lemma is
     *             {@code estar}: among the children when {@code true}, at the
     *             head when {@code false}
     */
    public void normalize(Node node, boolean iula) {
        String form = node.getForm();
        degree_ = Degree._;
        noun_degree_ = NounDegree._;

        if ("se".equals(form)) {
            reset();
            pos_ = Pos.p;
            type_ = Type.r;
            number_ = Number.n;
            gender_ = Gender.c;
            person_ = Person.third;
            return;
        }

        switch (pos_) {
        case a:
            if (function_ == Function.p) {
                pos_ = Pos.v;
                function_ = Function._;
                mood_ = Mood.p;
                type_ = Type.m;
            }
            break;
        case d:
        case p:
            if (NUMBERS.contains(form)) {
                reset();
                pos_ = Pos.z;
            }
            break;
        case s:
            reset();
            pos_ = Pos.s;
            type_ = Type.p;
            form_ = Form.s;
            break;
        case n:
            if (type_ == Type.p) {
                reset();
                pos_ = Pos.n;
                type_ = Type.p;
            }
            break;
        case v:
            normalizeVerb(node, iula);
            break;
        case r:
            if (type_ == Type.n) {
                reset();
                pos_ = Pos.r;
                type_ = Type.n;
            } else {
                pos_ = Pos.r;
                type_ = Type.g;
            }
            break;
        default:
            break;
        }
    }

    /** Applies the verb-specific rules of {@link #normalize(Node, boolean)}. */
    private void normalizeVerb(Node node, boolean iula) {
        if (gender_ == Gender.c) {
            gender_ = Gender._;
        }
        if (number_ == Number.n) {
            number_ = Number._;
        }
        String lemma = node.getLemma();
        if ("ser".equals(lemma)) {
            type_ = Type.s;
        } else if ("estar".equals(lemma)) {
            type_ = hasAdjacentGerund(node, iula) ? Type.a : Type.m;
        }
    }

    /**
     * Reports whether a gerund verb is attached to {@code node}: any child
     * when {@code iula} is {@code true}, the head otherwise.
     */
    private static boolean hasAdjacentGerund(Node node, boolean iula) {
        if (!iula) {
            return isGerundVerb(node.getHead());
        }
        for (Node child : node.getChildren()) {
            if (isGerundVerb(child)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Reports whether {@code candidate} carries an {@code EaglesTag} marking a
     * verb in gerund mood; {@code null} nodes and nodes without such a tag
     * yield {@code false}.
     */
    private static boolean isGerundVerb(Node candidate) {
        if (candidate == null) {
            return false;
        }
        Object morph = candidate.getMorphTag();
        if (!(morph instanceof EaglesTag)) {
            return false;
        }
        EaglesTag tag = (EaglesTag) morph;
        return tag.pos_ == Pos.v && tag.mood_ == Mood.g;
    }

    /** Returns the name of the part-of-speech value. */
    @Override
    public String toPosString() {
        return pos_.toString();
    }
}