package edu.stanford.nlp.parser.lexparser;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.util.Index;
/**
* This is a basic unknown word model for Arabic. It supports 5 different
* types of feature modeling (unknown word levels 6-10); see {@link #getSignature(String, int)}.
*
* <i>Implementation note: the contents of this class tend to overlap somewhat
* with {@link EnglishUnknownWordModel} and were originally included in {@link BaseLexicon}.</i>
*
* @author Roger Levy
* @author Christopher Manning
* @author Anna Rafferty
*/
public class ArabicUnknownWordModel extends BaseUnknownWordModel {

  /** A logger for this class. */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ArabicUnknownWordModel.class);

  private static final long serialVersionUID = 4825624957364628771L;

  /** Smallest unknown-word level accepted by the constructor. */
  private static final int MIN_UNKNOWN = 6;

  /** Largest unknown-word level accepted by the constructor. */
  private static final int MAX_UNKNOWN = 10;

  /** Copied from {@code op.lexOptions.smartMutation}; not read anywhere in this class. */
  protected final boolean smartMutation;

  /** Maximum number of trailing characters appended to the signature at levels 9 and 10. */
  protected final int unknownSuffixSize;

  /** Maximum number of leading characters appended to the signature at levels 9 and 10. */
  protected final int unknownPrefixSize;

  /**
   * Creates an Arabic unknown word model over the given indices and counter.
   *
   * @param op Parser options; {@code op.lexOptions} supplies smartMutation,
   *           unknownSuffixSize and unknownPrefixSize
   * @param lex The lexicon, passed through to the superclass
   * @param wordIndex Index of words; signatures are also stored in it
   * @param tagIndex Index of tags
   * @param unSeenCounter Counts used by {@link #score}
   * @throws IllegalArgumentException if the unknown word level (as held by the
   *         superclass after construction) is outside the range 6..10
   */
  public ArabicUnknownWordModel(Options op, Lexicon lex,
                                Index<String> wordIndex,
                                Index<String> tagIndex,
                                ClassicCounter<IntTaggedWord> unSeenCounter) {
    super(op, lex, wordIndex, tagIndex, unSeenCounter, null, null, null);
    if (unknownLevel < MIN_UNKNOWN || unknownLevel > MAX_UNKNOWN) {
      throw new IllegalArgumentException("Invalid value for useUnknownWordSignatures: " + unknownLevel);
    }
    this.smartMutation = op.lexOptions.smartMutation;
    this.unknownSuffixSize = op.lexOptions.unknownSuffixSize;
    this.unknownPrefixSize = op.lexOptions.unknownPrefixSize;
  }

  /**
   * This constructor creates an UWM with empty data structures. Only
   * use if loading in the data separately, such as by reading in text
   * lines containing the data.
   *
   * @throws IllegalArgumentException under the same condition as the
   *         five-argument constructor, to which this one delegates
   */
  public ArabicUnknownWordModel(Options op, Lexicon lex,
                                Index<String> wordIndex,
                                Index<String> tagIndex) {
    this(op, lex, wordIndex, tagIndex, new ClassicCounter<>());
  }

  /**
   * Scores an unknown word given a tag, as a log value.
   * The result is {@code log(pb_T_S * p_W / p_T)} where {@code pb_T_S} is the
   * smoothed estimate of P(T|S) built from the unseen-word counter (S being the
   * word's signature), {@code p_W = 1 / total} and {@code p_T = c_Tseen / total}.
   *
   * @param iTW The tagged word; only its tag is read here, the signature is
   *            computed from {@code word}
   * @param loc Position of the word in the sentence
   * @param c_Tseen Count used as the numerator of p_T
   * @param total Denominator of both p_T and p_W
   * @param smooth Smoothing weight applied to P(T|U)
   * @param word The word string, used to compute the signature
   * @return The log score narrowed to a float; no guard is applied against zero
   *         counts, so the result follows IEEE arithmetic in that case
   */
  @Override
  public float score(IntTaggedWord iTW, int loc, double c_Tseen, double total, double smooth, String word) {
    // unknown word model for P(T|S)
    int wordSig = getSignatureIndex(iTW.word, loc, word);

    double c_TS = unSeenCounter.getCount(new IntTaggedWord(wordSig, iTW.tag));
    double c_S = unSeenCounter.getCount(new IntTaggedWord(wordSig, nullTag));
    double c_U = unSeenCounter.getCount(NULL_ITW);
    double c_T = unSeenCounter.getCount(new IntTaggedWord(nullWord, iTW.tag));

    double p_T_U = c_T / c_U;

    // The constructor only admits levels 6..10, so this branch is not expected
    // to fire for instances built through it; kept for parity with other models.
    if (unknownLevel == 0) {
      c_TS = 0;
      c_S = 0;
    }

    double pb_T_S = (c_TS + smooth * p_T_U) / (c_S + smooth);
    double p_T = (c_Tseen / total);
    double p_W = 1.0 / total;

    return (float) Math.log(pb_T_S * p_W / p_T);
  }

  /**
   * Returns the index of the signature of {@code word}, where the signature is
   * the String representation of unknown word features. The signature is
   * passed to {@code wordIndex.addToIndex}, so it may be added to the word
   * index as a side effect.
   *
   * @param index Ignored by this implementation; the signature is derived from
   *              {@code word} alone
   * @param sentencePosition Position of the word in the sentence, forwarded to
   *                         {@link #getSignature(String, int)}
   * @param word The word to compute the signature for
   * @return The index of the signature in the word index
   */
  @Override
  public int getSignatureIndex(int index, int sentencePosition, String word) {
    String uwSig = getSignature(word, sentencePosition);
    return wordIndex.addToIndex(uwSig);
  }

  /**
   * Levels 6-10 were added for Arabic. 6 looks for the prefix Al- (and
   * knows that Buckwalter uses various symbols as letters), while 7 just looks
   * for numbers and last letter. 8 looks for Al-, looks for several useful
   * suffixes, and tracks the first letter of the word. (note that the first
   * letter seems a bit more informative than the last letter, overall.)
   * 9 tries to build on 8, but avoiding some of its perceived flaws: really it
   * was using the first AND last letter. 10 is level 9 plus the Unicode
   * character type for one-character words.
   * <p>
   * An empty word does not throw at any level: the first/last character
   * features are simply omitted when there is no character to take.
   *
   * @param word The word to make a signature for
   * @param loc Its position in the sentence; not used by any of the Arabic levels
   * @return A String that is its signature (equivalence class)
   */
  @Override
  public String getSignature(String word, int loc) {
    StringBuilder sb = new StringBuilder("UNK");
    switch (unknownLevel) {
      case 10: { // Anna's attempt at improving Chris' attempt, April 2008
        boolean allDigitPlus = ArabicUnknownWordSignatures.allDigitPlus(word);
        appendNumOrPrefix(sb, word, allDigitPlus);
        if (word.length() == 1) {
          // add in the unicode type for the char (appended as a bare int, no separator)
          sb.append(Character.getType(word.charAt(0)));
        }
        appendMorphologyAndSuffix(sb, word, allDigitPlus);
        break;
      }

      case 9: { // Chris' attempt at improving Roger's Arabic attempt, Nov 2006.
        boolean allDigitPlus = ArabicUnknownWordSignatures.allDigitPlus(word);
        appendNumOrPrefix(sb, word, allDigitPlus);
        appendMorphologyAndSuffix(sb, word, allDigitPlus);
        break;
      }

      case 8: { // Roger's attempt at an Arabic UWM, May 2006.
        if (word.startsWith("Al")) {
          sb.append("-Al");
        }
        boolean allDigitPlus = ArabicUnknownWordSignatures.allDigitPlus(word);
        if (allDigitPlus) {
          sb.append("-NUM");
        } else if (!word.isEmpty()) {
          // the first letters of a word seem more informative overall than the
          // last letters.
          // Alternatively we could add on the first two letters, if there's
          // enough data.
          sb.append('-').append(word.charAt(0));
        }
        // Unlike levels 9/10, both the taa marbuuTa and abstraction noun
        // features are always appended here, in this order.
        sb.append(ArabicUnknownWordSignatures.likelyAdjectivalSuffix(word));
        sb.append(ArabicUnknownWordSignatures.pastTenseVerbNumberSuffix(word));
        sb.append(ArabicUnknownWordSignatures.presentTenseVerbNumberSuffix(word));
        sb.append(ArabicUnknownWordSignatures.taaMarbuuTaSuffix(word));
        sb.append(ArabicUnknownWordSignatures.abstractionNounSuffix(word));
        break;
      }

      case 7: {
        // For Arabic with Al's separated off (cdm, May 2006)
        // { -NUM, lastChar }
        appendNumOrLastChar(sb, word);
        break;
      }

      case 6: {
        // For Arabic (cdm, May 2006), with Al- as part of word
        // { -Al, 0 } +
        // { -NUM, last char }
        if (word.startsWith("Al")) {
          sb.append("-Al");
        }
        appendNumOrLastChar(sb, word);
        break;
      }

      default:
        // 0 = do nothing so it just stays as "UNK"
        break;
    } // end switch (unknownLevel)

    // log.info("Summarized " + word + " to " + sb.toString());
    return sb.toString();
  } // end getSignature()

  /**
   * Appends the leading feature shared by levels 9 and 10: "-NUM" for
   * all-digit words, else "-Al" for words starting with Al (Latin or
   * Arabic-script alif-lam), else up to unknownPrefixSize leading characters.
   */
  private void appendNumOrPrefix(StringBuilder sb, String word, boolean allDigitPlus) {
    if (allDigitPlus) {
      sb.append("-NUM");
    } else if (word.startsWith("Al") || word.startsWith("\u0627\u0644")) {
      sb.append("-Al");
    } else if (unknownPrefixSize > 0) {
      // the first letters of a word seem more informative overall than the
      // last letters.
      int prefixLength = Math.min(word.length(), unknownPrefixSize);
      sb.append('-').append(word.substring(0, prefixLength));
    }
  }

  /**
   * Appends the trailing features shared by levels 9 and 10: the morphological
   * suffix markers, then up to unknownSuffixSize trailing characters for words
   * that are not all-digit.
   */
  private void appendMorphologyAndSuffix(StringBuilder sb, String word, boolean allDigitPlus) {
    sb.append(ArabicUnknownWordSignatures.likelyAdjectivalSuffix(word));
    sb.append(ArabicUnknownWordSignatures.pastTenseVerbNumberSuffix(word));
    sb.append(ArabicUnknownWordSignatures.presentTenseVerbNumberSuffix(word));
    // The taa marbuuTa feature is only consulted when no abstraction noun
    // suffix was found.
    String abstraction = ArabicUnknownWordSignatures.abstractionNounSuffix(word);
    if (! "".equals(abstraction)) {
      sb.append(abstraction);
    } else {
      sb.append(ArabicUnknownWordSignatures.taaMarbuuTaSuffix(word));
    }
    if (unknownSuffixSize > 0 && ! allDigitPlus) {
      int suffixLength = Math.min(word.length(), unknownSuffixSize);
      sb.append('-').append(word.substring(word.length() - suffixLength));
    }
  }

  /**
   * Appends "-NUM" for all-digit words, otherwise the word's last character
   * (with no separator, as levels 6 and 7 have always done). Appends nothing
   * for an empty non-numeric word instead of throwing.
   */
  private static void appendNumOrLastChar(StringBuilder sb, String word) {
    if (ArabicUnknownWordSignatures.allDigitPlus(word)) {
      sb.append("-NUM");
    } else if (!word.isEmpty()) {
      sb.append(word.charAt(word.length() - 1));
    }
  }

  /**
   * Returns the unknown word level in use.
   *
   * @return The value of {@code unknownLevel}
   */
  @Override
  public int getUnknownLevel() {
    return unknownLevel;
  }

}