package edu.stanford.nlp.international.spanish.process;
import edu.stanford.nlp.international.spanish.SpanishVerbStripper;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.util.Pair;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
/**
 * A utility for preprocessing the AnCora Spanish corpus.
 *
 * Attempts to disambiguate Spanish personal pronouns which have
 * multiple senses:
 *
 * <em>me, te, se, nos, os</em>
 *
 * Each of these can be used as 1) an indirect object pronoun or as
 * 2) a reflexive pronoun. (<em>me, te, nos,</em> and <em>os</em> can
 * also be used as direct object pronouns.)
 *
 * For the purposes of corpus preprocessing, all we need is to
 * distinguish between the object- and reflexive-pronoun cases.
 *
 * Disambiguation is attempted in this order:
 * <ol>
 *   <li>a lone <em>se</em> is always treated as reflexive;</li>
 *   <li>brute force: manual tags for specific (verb, clause) pairs
 *       which appear in the AnCora corpus;</li>
 *   <li>(dictionary-powered) heuristics: verb-level lists of stems
 *       which are always / never reflexive in the corpus.</li>
 * </ol>
 * The clause-level manual tags are consulted before the verb-level
 * lists because they are strictly more specific; otherwise a manual
 * tag for a verb which also appears in one of the lists could never
 * take effect.
 *
 * @author Jon Gauthier
 * @see edu.stanford.nlp.trees.international.spanish.SpanishTreeNormalizer
 */
public class AnCoraPronounDisambiguator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(AnCoraPronounDisambiguator.class);

  /**
   * The sense assigned to an ambiguous clitic pronoun. {@code UNKNOWN}
   * is returned when none of the disambiguation strategies applies.
   */
  public enum PersonalPronounType {OBJECT, REFLEXIVE, UNKNOWN}

  /** Lower-cased pronouns which this class knows how to disambiguate. */
  private static final Set<String> ambiguousPersonalPronouns = new HashSet<>(Arrays.asList(
    "me", "te", "se", "nos", "os"
  ));

  /**
   * The following verbs always use ambiguous pronouns in a reflexive
   * sense in the corpus.
   *
   * Stems are matched case-sensitively and exactly as given here, which
   * is why some verbs are listed in several capitalizations / forms.
   */
  private static final Set<String> alwaysReflexiveVerbs = new HashSet<>(Arrays.asList(
    "acercar",
    "acostumbrar",
    "adaptar",
    "afeitar",
    "agarrar",
    "ahincar",
    "alegrar",
    "Anticipar",
    "aplicar",
    "aprobar",
    "aprovechar",
    "asegurar",
    "Atreve",
    "bajar",
    "beneficiar",
    "callar",
    "casar",
    "cobrar",
    "colocar",
    "comer",
    "comportar",
    "comprar",
    "concentrar",
    "cuidar",
    "deber",
    "decidir",
    "defender",
    "desplazar",
    "detectar",
    "divirtiendo",
    "echar",
    "encontrar",
    "enfrentar",
    "entender",
    "enterar",
    "entrometer",
    "equivocar",
    "escapar",
    "esconder",
    "esforzando",
    "establecer",
    "felicitar",
    "fija",
    "Fija",
    "ganar",
    "guarda",
    "guardar",
    "Habituar",
    "hacer",
    "imagina",
    "imaginar",
    "iniciar",
    "inscribir",
    "ir",
    "jode",
    "jugar",
    "Levantar",
    "Manifestar",
    "mantener",
    "marchar",
    "meter",
    "Negar",
    "obsesionar",
    "Olvida",
    "Olvidar",
    "olvidar",
    "oponer",
    "Para",
    "pasar",
    "plantear",
    "poner",
    "pudra",
    "queda",
    "quedar",
    "querer",
    "quita",
    "reciclar",
    "reconoce",
    "reconstruir",
    "recordar",
    "recuperar",
    "reencontrar",
    "referir",
    "registrar",
    "reincorporar",
    "rendir",
    "reservar",
    "retirar",
    "reunir",
    "sentar",
    "sentir",
    "someter",
    "subir",
    "tirando",
    "toma",
    "tomar",
    "tomen",
    "Une",
    "unir",
    "Ve",
    "vestir"
  ));

  /**
   * The following verbs always use ambiguous clitic pronouns in an
   * object sense **in the corpora supported.**
   *
   * This does not imply that the below verbs are only ever non-reflexive!
   * This list may need to be revised in order to produce correct gold trees
   * on new datasets.
   *
   * NOTE(review): "comprar" is also listed in {@code alwaysReflexiveVerbs},
   * which is consulted first, so its entry here currently has no effect.
   */
  private static final Set<String> neverReflexiveVerbs = new HashSet<>(Arrays.asList(
    "abrir",
    "aguar",
    "anunciar",
    "arrebatando",
    "arruinar",
    "clasificar",
    "compensar",
    "compra",
    "comprar",
    "concretar",
    "contar",
    "crea",
    "crear",
    "Cuente",
    "Decir",
    "decir",
    "deja",
    "digan",
    "devolver",
    "devuelve",
    "dirigiendo",
    "distraer",
    "enfrascar",
    "exigiendo",
    "exigir",
    "haz",
    "ignorar",
    "impedir",
    "insultar",
    "juzgar",
    "llamar",
    "llevando",
    "llevar",
    "manda",
    "mirar",
    "Miren",
    "multar",
    "negar",
    "ocultando",
    "pagar",
    "patear",
    "pedir",
    "permitir",
    "pidiendo",
    "preguntar",
    "prevenir",
    "quitar",
    "razona",
    "resultar",
    "saca",
    "sacar",
    "saludar",
    "seguir",
    "servir",
    "situar",
    "suceder",
    "tener",
    "tutear",
    "utilizar",
    "vender",
    "ver",
    "visitar"
  ));

  /**
   * Brute-force: based on clauses which we recognize from AnCora,
   * dictate the type of pronoun being used
   *
   * Map from pair (verb, containing clause) to personal pronoun type.
   * Both components are matched exactly (case- and whitespace-sensitive).
   */
  private static final Map<Pair<String, String>, PersonalPronounType> bruteForceDecisions =
    new HashMap<>();

  static {
    bruteForceDecisions.put(
      new Pair<>("contar", "No contarte mi vida nunca más"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("Creer", "Creerselo todo"), PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("creer", "creérselo todo ..."), PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("creer", "creerte"), PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("Dar", "Darte de alta ahi"), PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("da", "A mi dame billetes uno al lado del otro que es la forma mas líquida que uno pueda estar"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("da", "danos UNA razon UNA"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("da", "y ... dame una razon por la que hubiera matado o se hubiera comido a el compañero ?"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("dar", "darme cuenta"), PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("dar", "darme la enhorabuena"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("dar", "darnos cuenta"), PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("dar", "darselo a la doña"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("dar", "darte cuenta"), PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("dar", "darte de alta"), PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("dar", "darte vuelta en cuestiones que no tienen nada que ver con lo que comenzaste diciendo"), PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("dar", "podría darnos"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("dar", "puede darnos"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("decir", "suele decirnos"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("decir", "suelo decírmelo"), PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("dejar", "debería dejarnos faenar"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("dejar", "dejarme un intermitente encendido"), PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("dejar", ": dejarnos un país tan limpio en su gobierno como el cielo claro después de las tormentas mediterráneas , que inundan nuestras obras públicas sin encontrar nunca ni un solo responsable político de tanta mala gestión , ya sea la plaza de Cerdà socialista o los incendios forestales de la Generalitat"),
      PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("dejar", "podemos dejarnos adormecer"), PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("engañar", "engañarnos"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("estira", "=LRB= al menos estirate a los japoneses HDP !!! =RRB="),
      PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("explica", "explicame como hago"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("explicar", "deberá explicarnos"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("liar", "liarme a tiros"), PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("librar", "librarme de el mismo para siempre"),
      PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("llevar", "llevarnos a una trampa en esta elección"),
      PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("manifestar", "manifestarme su solidaridad"),
      PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("manifestar", "manifestarnos sobre las circunstancias que mantienen en vilo la vida y obra de los colombianos"),
      PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("mirando", "estábamos mirándonos"), PersonalPronounType.REFLEXIVE);
    bruteForceDecisions.put(
      new Pair<>("poner", "ponerme en ascuas"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("servir", "servirme de guía"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("volver", "debe volvernos"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("volver", "deja de volverme"), PersonalPronounType.OBJECT);
    bruteForceDecisions.put(
      new Pair<>("volver", "volvernos"), PersonalPronounType.REFLEXIVE);
  }

  /**
   * Determine if the given pronoun can have multiple senses.
   *
   * The lookup is case-sensitive: only the lower-case forms
   * <em>me, te, se, nos, os</em> are recognized.
   *
   * @param pronoun The pronoun to test
   * @return {@code true} iff the pronoun is one of the ambiguous
   *         personal pronouns handled by this class
   */
  public static boolean isAmbiguous(String pronoun) {
    return ambiguousPersonalPronouns.contains(pronoun);
  }

  /**
   * Determine whether the given clitic pronoun is an indirect object
   * pronoun or a reflexive pronoun.
   *
   * This method is only defined when the pronoun is one of
   *
   * me, te, se, nos, os
   *
   * i.e., those in which the meaning is actually ambiguous. The pronoun
   * is lower-cased before this check, so capitalized forms are accepted.
   *
   * @param strippedVerb Stripped verb as returned by
   *                     {@link edu.stanford.nlp.international.spanish.SpanishVerbStripper#separatePronouns(String)}.
   * @param pronounIdx The index of the pronoun within
   *                   {@code strippedVerb.getPronouns()} which should be
   *                   disambiguated.
   * @param clauseYield A string representing the yield of the
   *                    clause which contains the given verb
   * @return The sense of the pronoun, or
   *         {@link PersonalPronounType#UNKNOWN} if it could not be
   *         determined (in which case the failure is also logged)
   * @throws java.lang.IllegalArgumentException If the given pronoun is
   *         not ambiguous, or its disambiguation is not supported.
   * @throws java.lang.IndexOutOfBoundsException If {@code pronounIdx} is
   *         not a valid index into {@code strippedVerb.getPronouns()}
   *         (as thrown by {@link List#get(int)}).
   */
  public static PersonalPronounType disambiguatePersonalPronoun(SpanishVerbStripper.StrippedVerb strippedVerb,
                                                                int pronounIdx, String clauseYield) {
    List<String> pronouns = strippedVerb.getPronouns();
    // Locale.ROOT keeps the case mapping independent of the JVM's default locale.
    String pronoun = pronouns.get(pronounIdx).toLowerCase(Locale.ROOT);

    if (!isAmbiguous(pronoun)) {
      throw new IllegalArgumentException("We don't support disambiguating pronoun '" + pronoun + "'");
    }

    // A verb whose only clitic is "se" is always tagged reflexive.
    if (pronouns.size() == 1 && pronoun.equals("se")) {
      return PersonalPronounType.REFLEXIVE;
    }

    String verb = strippedVerb.getStem();

    // Clause-specific manual tags are checked before the verb-level lists:
    // they are more specific, and would otherwise be shadowed whenever the
    // same stem also appears in one of those lists (e.g. "decir", "poner").
    // The map never stores null values, so a null result means "no entry".
    Pair<String, String> bruteForceKey = new Pair<>(verb, clauseYield);
    PersonalPronounType manualTag = bruteForceDecisions.get(bruteForceKey);
    if (manualTag != null) {
      return manualTag;
    }

    // Verb-level heuristics; the always-reflexive list takes precedence.
    if (alwaysReflexiveVerbs.contains(verb)) {
      return PersonalPronounType.REFLEXIVE;
    }
    if (neverReflexiveVerbs.contains(verb)) {
      return PersonalPronounType.OBJECT;
    }

    // Log this instance where a clitic pronoun could not be disambiguated.
    log.info("Failed to disambiguate: " + verb
             + "\nContaining clause:\t" + clauseYield + "\n");

    return PersonalPronounType.UNKNOWN;
  }

}