package edu.stanford.nlp.international.spanish;
import java.util.regex.Pattern;
/**
* Contains patterns for matching certain word types in Spanish, such
* as common suffixes for nouns, verbs, adjectives and adverbs.
*
* These utilities are used to characterize unknown words within the
* POS tagger and the parser.
*
* @see edu.stanford.nlp.tagger.maxent.ExtractorFramesRare
* @see SpanishUnknownWordModel
*
* @author Jon Gauthier
*/
public class SpanishUnknownWordSignatures {

  /** Matches a word-final {@code -o} or {@code -os}. */
  private static final Pattern MASCULINE = Pattern.compile("os?$");

  /** Matches a word-final {@code -a} or {@code -as}. */
  private static final Pattern FEMININE = Pattern.compile("as?$");

  // Conditional forms and the indicative imperfect of -er / -ir verbs
  // share the "-ía" family of endings, which makes them hard to tell
  // apart. The two patterns below separate them by looking at what
  // precedes the "ía": a theme vowel plus "r" for the conditional,
  // anything other than "r" for the imperfect.

  /** Matches {@code a/e/i + ría}, optionally followed by {@code s}, {@code mos}, {@code is} or {@code n}. */
  private static final Pattern CONDITIONAL = Pattern.compile("[aei]ría(?:s|mos|is|n)?$");

  /** Matches {@code ía} (plus optional person ending) when the preceding character is not {@code r}. */
  private static final Pattern IMPERFECT_ER_IR = Pattern.compile("[^r]ía(?:s|mos|is|n)?$");

  /**
   * Matches any imperfect ending handled by this class: the
   * {@code -aba} family ({@code -aba}, {@code -abas}, {@code -aban},
   * {@code -abais}, {@code -ábamos}) as well as the {@code -ía}
   * family not preceded by {@code r}.
   */
  private static final Pattern IMPERFECT = Pattern.compile(
      "(?:aba(?:[sn]|is)?|ábamos|[^r]ía(?:s|mos|is|n)?)$");

  /** Matches a word-final {@code -ar}, {@code -er} or {@code -ir}. */
  private static final Pattern INFINITIVE = Pattern.compile("[aei]r$");

  /** Matches a word-final {@code -mente}. */
  private static final Pattern ADVERB = Pattern.compile("mente$");

  /**
   * Matches a word-final {@code -amos}, {@code -emos} or {@code -imos}
   * unless the text right before it is one of the stems listed in the
   * negative lookbehind. Most look-alike words carry contrastive
   * stress (an accent mark in the stem), so they are cheap to exclude.
   */
  private static final Pattern VERB_FIRST_PERSON_PLURAL = Pattern.compile(
      "(?<!últ|máx|mín|án|próx|ís|cént|[np]ón|prést|gít|ínt|pár" +
      "|^extr|^supr|^tr?|^[Rr]?|gr)[eia]mos$");

  /**
   * Matches, case-insensitively, a word-final {@code -ando} or
   * {@code -endo}, except when the whole text before that ending is
   * one of the word-initial stems listed in the corresponding
   * negative lookbehind.
   */
  private static final Pattern GERUND = Pattern.compile(
      "(?i)((?<!^([bmn]|bl|com|contrab|cu|[fh]ern))a" +
      "|(?<!^(asci|ati|atu|compr|condesci|conti|desati|desci|desenti|disti|divid|enci|enti|estup" +
      "|exti|fi|hi|malenti|pret|refer|rever|sobreenti|subti|ti|transci|trasci|trem))e)ndo$");

  /** Not instantiable: this class only exposes static methods. */
  private SpanishUnknownWordSignatures() {
  }

  /**
   * Reports whether {@code signature} can be found somewhere in
   * {@code word}. All patterns in this class are anchored with
   * {@code $}, so a hit is always at the end of the word.
   */
  private static boolean found(Pattern signature, String word) {
    return signature.matcher(word).find();
  }

  /** Returns {@code tag} when {@code present} is true, otherwise the empty string. */
  private static String tagIf(boolean present, String tag) {
    if (present) {
      return tag;
    }
    return "";
  }

  /**
   * @param s the word to inspect
   * @return true if {@code s} ends in {@code -o} or {@code -os}
   */
  public static boolean hasMasculineSuffix(String s) {
    return found(MASCULINE, s);
  }

  /**
   * @param s the word to inspect
   * @return true if {@code s} ends in {@code -a} or {@code -as}
   */
  public static boolean hasFeminineSuffix(String s) {
    return found(FEMININE, s);
  }

  /**
   * @param s the word to inspect
   * @return true if {@code s} ends in a conditional-looking suffix
   *         (theme vowel + {@code ría} + optional person ending)
   */
  public static boolean hasConditionalSuffix(String s) {
    return found(CONDITIONAL, s);
  }

  /**
   * @param s the word to inspect
   * @return true if {@code s} ends in an {@code -ía} imperfect suffix
   *         whose preceding character is not {@code r}
   */
  public static boolean hasImperfectErIrSuffix(String s) {
    return found(IMPERFECT_ER_IR, s);
  }

  /**
   * @param s the word to inspect
   * @return true if {@code s} ends in any imperfect suffix recognized
   *         by this class ({@code -aba} family or {@code -ía} family)
   */
  public static boolean hasImperfectSuffix(String s) {
    return found(IMPERFECT, s);
  }

  /**
   * @param s the word to inspect
   * @return true if {@code s} ends in {@code -ar}, {@code -er} or {@code -ir}
   */
  public static boolean hasInfinitiveSuffix(String s) {
    return found(INFINITIVE, s);
  }

  /**
   * @param s the word to inspect
   * @return true if {@code s} ends in {@code -mente}
   */
  public static boolean hasAdverbSuffix(String s) {
    return found(ADVERB, s);
  }

  /**
   * @param s the word to inspect
   * @return true if {@code s} ends in {@code -amos}, {@code -emos} or
   *         {@code -imos} and is not excluded by the look-alike list
   */
  public static boolean hasVerbFirstPersonPluralSuffix(String s) {
    return found(VERB_FIRST_PERSON_PLURAL, s);
  }

  /**
   * @param s the word to inspect
   * @return true if {@code s} ends in {@code -ando} or {@code -endo}
   *         and is not excluded by the look-alike list
   */
  public static boolean hasGerundSuffix(String s) {
    return found(GERUND, s);
  }

  // The methods below turn a suffix test into a short tag string.
  // SpanishUnknownWordModel uses these tags to build a representation
  // of an unknown word.

  /**
   * @param s the word to inspect
   * @return {@code "-cond"} if {@link #hasConditionalSuffix(String)}
   *         holds for {@code s}, otherwise the empty string
   */
  public static String conditionalSuffix(String s) {
    return tagIf(hasConditionalSuffix(s), "-cond");
  }

  /**
   * @param s the word to inspect
   * @return {@code "-imp"} if {@link #hasImperfectSuffix(String)}
   *         holds for {@code s}, otherwise the empty string
   */
  public static String imperfectSuffix(String s) {
    return tagIf(hasImperfectSuffix(s), "-imp");
  }

  /**
   * @param s the word to inspect
   * @return {@code "-inf"} if {@link #hasInfinitiveSuffix(String)}
   *         holds for {@code s}, otherwise the empty string
   */
  public static String infinitiveSuffix(String s) {
    return tagIf(hasInfinitiveSuffix(s), "-inf");
  }

  /**
   * @param s the word to inspect
   * @return {@code "-adv"} if {@link #hasAdverbSuffix(String)}
   *         holds for {@code s}, otherwise the empty string
   */
  public static String adverbSuffix(String s) {
    return tagIf(hasAdverbSuffix(s), "-adv");
  }
}