package edu.stanford.nlp.parser.lexparser;

import java.util.regex.Pattern;

/**
 * Unknown word signatures for the Arabic Treebank.
 * These handle unvocalized Arabic, in either Buckwalter or Unicode.
 *
 * <p>Every method is a stateless static helper. The suffix/prefix methods
 * return either a short signature tag (for example {@code "-AdjSuffix"}) or
 * the empty string when the word does not show the feature. Each character
 * class in the patterns below pairs a Buckwalter transliteration letter with
 * the corresponding Arabic code point, so both encodings are matched.
 *
 * @author Roger Levy (rog@csli.stanford.edu)
 * @author Christopher Manning (extended to handle UTF-8)
 */
final class ArabicUnknownWordSignatures {

  /**
   * Nisba suffix for deriving adjectives: (i)yy(n) [masc] or -(i)yya [fem].
   * Other adjectives are made in the binyanim system by vowel changes.
   * Matches a doubled y / U+064A (yeh), optionally followed by an optional
   * t / U+062A (teh) and then n / U+0646 (noon), at the end of the word.
   */
  private static final Pattern ADJECTIVAL_SUFFIX =
      Pattern.compile("[y\u064A][y\u064A](?:[t\u062A]?[n\u0646])?$");

  /** Word-final t / U+062A (teh). */
  private static final Pattern SINGULAR_PAST_TENSE_SUFFIX =
      Pattern.compile("[t\u062A]$");

  /** Word-final n / U+0646 (noon) followed by A / U+0627 (alef). */
  private static final Pattern PLURAL_FIRST_PERSON_PAST_TENSE_SUFFIX =
      Pattern.compile("[n\u0646][A\u0627]$");

  /** Word-final w / U+0648 (waw). */
  private static final Pattern PLURAL_THIRD_PERSON_MASCULINE_PAST_TENSE_SUFFIX =
      Pattern.compile("[w\u0648]$");

  // A third-person feminine plural past tense pattern (a bare word-final
  // n / U+0646) could be used but doesn't seem very discriminating, so it is
  // deliberately not defined.
  //
  // There doesn't seem to be second-person marking in the corpus, just first
  // and non-first (?)

  /** Word-final w / U+0648 (waw) followed by n / U+0646 (noon). */
  private static final Pattern PLURAL_THIRD_PERSON_MASCULINE_PRESENT_TENSE_SUFFIX =
      Pattern.compile("[w\u0648][\u0646n]$");

  /** Word-final p / U+0629 (teh marbuta); almost always ADJ or NOUN. */
  private static final Pattern TAA_MARBUUTA_SUFFIX =
      Pattern.compile("[\u0629p]$");

  /**
   * Word-final y / U+064A (yeh) followed by p / U+0629 (teh marbuta).
   * Roger wrote: "ironically, this seems to be a better indicator of ADJ than
   * of NOUN", but Chris thinks it may just have been a bug in his code.
   */
  private static final Pattern ABSTRACTION_NOUN_SUFFIX =
      Pattern.compile("[y\u064a][p\u0629]$");

  /** Word-initial t / U+062A (teh). */
  private static final Pattern MASDAR_PREFIX =
      Pattern.compile("^[t\u062A]");

  /** Not instantiable: this class only holds static helpers. */
  private ArabicUnknownWordSignatures() { }

  /**
   * Tests whether a word looks like a number: it must contain at least one
   * digit, and every other character must be one of the numeric punctuation
   * characters accepted by {@link #isNumericPunctuation(char)}.
   *
   * @param word the word to inspect; must not be {@code null}
   * @return {@code true} if the word has one or more digits and otherwise
   *     only numeric punctuation; {@code false} otherwise, including for the
   *     empty string and for strings made only of punctuation
   */
  static boolean allDigitPlus(String word) {
    boolean seenDigit = false;
    for (int i = 0, wlen = word.length(); i < wlen; i++) {
      char ch = word.charAt(i);
      if (Character.isDigit(ch)) {
        seenDigit = true;
      } else if (!isNumericPunctuation(ch)) {
        // Any other character disqualifies the whole word.
        return false;
      }
    }
    return seenDigit;
  }

  /**
   * Tells whether a character is punctuation tolerated inside a number.
   *
   * @param ch the character to test
   * @return {@code true} for hyphen-minus, full stop, comma, the Arabic
   *     decimal separator, the Arabic thousands separator and the minus sign
   */
  private static boolean isNumericPunctuation(char ch) {
    switch (ch) {
      case '-':
      case '.':
      case ',':
      case '\u066B': // U+066B = Arabic decimal separator
      case '\u066C': // U+066C = Arabic thousands separator
      case '\u2212': // U+2212 = Minus sign
        return true;
      default:
        return false;
    }
  }

  /**
   * Returns the given tag when the pattern is found in the word.
   *
   * @param pattern the anchored pattern to search for
   * @param word the word to inspect; must not be {@code null}
   * @param signature the tag to return on a match
   * @return {@code signature} if the pattern is found, otherwise {@code ""}
   */
  private static String signatureIfFound(Pattern pattern, String word, String signature) {
    return pattern.matcher(word).find() ? signature : "";
  }

  /**
   * Marks words ending in the nisba (adjective-deriving) suffix.
   *
   * @param word the word to inspect; must not be {@code null}
   * @return {@code "-AdjSuffix"} if the suffix is present, otherwise {@code ""}
   */
  static String likelyAdjectivalSuffix(String word) {
    return signatureIfFound(ADJECTIVAL_SUFFIX, word, "-AdjSuffix");
  }

  /**
   * Classifies a word by past tense verb ending. The checks run in a fixed
   * order and the first one that matches wins: singular, then first person
   * plural, then third person masculine plural.
   *
   * @param word the word to inspect; must not be {@code null}
   * @return {@code "-PV.sg"}, {@code "-PV.pl1"}, {@code "-PV.pl3m"}, or
   *     {@code ""} when no ending matches
   */
  static String pastTenseVerbNumberSuffix(String word) {
    if (SINGULAR_PAST_TENSE_SUFFIX.matcher(word).find()) {
      return "-PV.sg";
    }
    if (PLURAL_FIRST_PERSON_PAST_TENSE_SUFFIX.matcher(word).find()) {
      return "-PV.pl1";
    }
    if (PLURAL_THIRD_PERSON_MASCULINE_PAST_TENSE_SUFFIX.matcher(word).find()) {
      return "-PV.pl3m";
    }
    return "";
  }

  /**
   * Marks words with the third person masculine plural present tense ending.
   *
   * @param word the word to inspect; must not be {@code null}
   * @return {@code "-IV.pl3m"} if the ending is present, otherwise {@code ""}
   */
  static String presentTenseVerbNumberSuffix(String word) {
    return signatureIfFound(
        PLURAL_THIRD_PERSON_MASCULINE_PRESENT_TENSE_SUFFIX, word, "-IV.pl3m");
  }

  /**
   * Marks words ending in taa marbuuTa.
   *
   * @param word the word to inspect; must not be {@code null}
   * @return {@code "-taaMarbuuTa"} if the ending is present, otherwise {@code ""}
   */
  static String taaMarbuuTaSuffix(String word) {
    return signatureIfFound(TAA_MARBUUTA_SUFFIX, word, "-taaMarbuuTa");
  }

  /**
   * Marks words ending in the abstraction noun suffix.
   *
   * @param word the word to inspect; must not be {@code null}
   * @return {@code "-AbstractionSuffix"} if the suffix is present, otherwise
   *     {@code ""}
   */
  static String abstractionNounSuffix(String word) {
    return signatureIfFound(ABSTRACTION_NOUN_SUFFIX, word, "-AbstractionSuffix");
  }

  /**
   * Marks words beginning with the masdar prefix.
   *
   * @param word the word to inspect; must not be {@code null}
   * @return {@code "-maSdr"} if the prefix is present, otherwise {@code ""}
   */
  static String masdarPrefix(String word) {
    return signatureIfFound(MASDAR_PREFIX, word, "-maSdr");
  }

}