package edu.stanford.nlp.international.french;
import java.util.regex.Pattern;
/**
* Contains patterns for matching certain word types in French, such
* as common suffices for nouns, verbs, adjectives and adverbs.
*/
public class FrenchUnknownWordSignatures {
private static final Pattern pNounSuffix = Pattern.compile("(?:ier|ière|ité|ion|ison|isme|ysme|iste|esse|eur|euse|ence|eau|erie|ng|ette|age|ade|ance|ude|ogue|aphe|ate|duc|anthe|archie|coque|érèse|ergie|ogie|lithe|mètre|métrie|odie|pathie|phie|phone|phore|onyme|thèque|scope|some|pole|ôme|chromie|pie)s?$");
private static final Pattern pAdjSuffix = Pattern.compile("(?:iste|ième|uple|issime|aire|esque|atoire|ale|al|able|ible|atif|ique|if|ive|eux|aise|ent|ois|oise|ante|el|elle|ente|oire|ain|aine)s?$");
private static final Pattern pHasDigit = Pattern.compile("\\d+");
private static final Pattern pIsDigit = Pattern.compile("^\\d+$");
private static final Pattern pPosPlural = Pattern.compile("(?:s|ux)$");
private static final Pattern pVerbSuffix = Pattern.compile("(?:ir|er|re|ez|ont|ent|ant|ais|ait|ra|era|eras|é|és|ées|isse|it)$");
private static final Pattern pAdvSuffix = Pattern.compile("(?:iment|ement|emment|amment)$");
private static final Pattern pHasPunc = Pattern.compile("(?:[\u0021-\u002F\u003A-\u0040\\u005B-\u0060\u007B-\u007E\u00A1-\u00BF\u00F7\u2010-\u2027\u2030-\u205E\u20A0-\u20BA])+");
private static final Pattern pIsPunc = Pattern.compile("^(?:[\u0021-\u002F\u003A-\u0040\\u005B-\u0060\u007B-\u007E\u00A1-\u00BF\u00F7\u2010-\u2027\u2030-\u205E\u20A0-\u20BA])+$");
private static final Pattern pAllCaps = Pattern.compile("^[A-Z\u00C0-\u00D6\u00D8-\u00DE]+$");
private FrenchUnknownWordSignatures() {} // static methods
public static boolean hasNounSuffix(String s) {
return pNounSuffix.matcher(s).find();
}
public static String nounSuffix(String s) {
return hasNounSuffix(s) ? "-noun" : "";
}
public static boolean hasAdjSuffix(String s) {
return pAdjSuffix.matcher(s).find();
}
public static String adjSuffix(String s) {
return hasAdjSuffix(s) ? "-adj" : "";
}
public static String hasDigit(String s) {
return pHasDigit.matcher(s).find() ? "-num" : "";
}
public static String isDigit(String s) {
return pIsDigit.matcher(s).find() ? "-isNum" : "";
}
public static boolean hasVerbSuffix(String s) {
return pVerbSuffix.matcher(s).find();
}
public static String verbSuffix(String s) {
return hasVerbSuffix(s) ? "-verb" : "";
}
public static boolean hasPossiblePlural(String s) {
return pPosPlural.matcher(s).find();
}
public static String possiblePlural(String s) {
return hasPossiblePlural(s) ? "-plural" : "";
}
public static boolean hasAdvSuffix(String s) {
return pAdvSuffix.matcher(s).find();
}
public static String advSuffix(String s) {
return hasAdvSuffix(s) ? "-adv" : "";
}
public static String hasPunc(String s) {
return pHasPunc.matcher(s).find() ? "-hpunc" : "";
}
public static String isPunc(String s) {
return pIsPunc.matcher(s).matches() ? "-ipunc" : "";
}
public static String isAllCaps(String s) {
return pAllCaps.matcher(s).matches() ? "-allcap" : "";
}
public static String isCapitalized(String s) {
if(s.length() > 0) {
Character ch = s.charAt(0);
return Character.isUpperCase(ch) ? "-upper" : "";
}
return "";
}
}