package edu.stanford.nlp.parser.lexparser;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.international.spanish.SpanishUnknownWordSignatures;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.StringUtils;
public class SpanishUnknownWordModel extends BaseUnknownWordModel {
/** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(SpanishUnknownWordModel.class);
protected final boolean smartMutation;
protected final int unknownSuffixSize;
protected final int unknownPrefixSize;
public SpanishUnknownWordModel(Options op, Lexicon lex,
Index<String> wordIndex,
Index<String> tagIndex,
ClassicCounter<IntTaggedWord> unSeenCounter) {
super(op, lex, wordIndex, tagIndex, unSeenCounter, null, null, null);
this.smartMutation = op.lexOptions.smartMutation;
this.unknownSuffixSize = op.lexOptions.unknownSuffixSize;
this.unknownPrefixSize = op.lexOptions.unknownPrefixSize;
}
/**
* This constructor creates an UWM with empty data structures. Only
* use if loading in the data separately, such as by reading in text
* lines containing the data.
*/
public SpanishUnknownWordModel(Options op, Lexicon lex,
Index<String> wordIndex,
Index<String> tagIndex) {
this(op, lex, wordIndex, tagIndex, new ClassicCounter<>());
}
@Override
public float score(IntTaggedWord iTW, int loc, double c_Tseen, double total, double smooth, String word) {
double pb_W_T; // always set below
// unknown word model for P(T|S)
int wordSig = getSignatureIndex(iTW.word, loc, word);
IntTaggedWord temp = new IntTaggedWord(wordSig, iTW.tag);
double c_TS = unSeenCounter.getCount(temp);
temp = new IntTaggedWord(wordSig, nullTag);
double c_S = unSeenCounter.getCount(temp);
double c_U = unSeenCounter.getCount(NULL_ITW);
temp = new IntTaggedWord(nullWord, iTW.tag);
double c_T = unSeenCounter.getCount(temp);
double p_T_U = c_T / c_U;
if (unknownLevel == 0) {
c_TS = 0;
c_S = 0;
}
double pb_T_S = (c_TS + smooth * p_T_U) / (c_S + smooth);
double p_T = (c_Tseen / total);
double p_W = 1.0 / total;
pb_W_T = Math.log(pb_T_S * p_W / p_T);
return (float) pb_W_T;
}
/**
* Returns the index of the signature of the word numbered wordIndex, where
* the signature is the String representation of unknown word features.
*/
@Override
public int getSignatureIndex(int index, int sentencePosition, String word) {
String uwSig = getSignature(word, sentencePosition);
int sig = wordIndex.addToIndex(uwSig);
return sig;
}
/**
* TODO Can add various signatures, setting the signature via Options.
*
* @param word The word to make a signature for
* @param loc Its position in the sentence (mainly so sentence-initial
* capitalized words can be treated differently)
* @return A String that is its signature (equivalence class)
*/
@Override
public String getSignature(String word, int loc) {
final String BASE_LABEL = "UNK";
StringBuilder sb = new StringBuilder(BASE_LABEL);
switch (unknownLevel) {
case 1:
if (StringUtils.isNumeric(word)) {
sb.append('#');
break;
} else if (StringUtils.isPunct(word)) {
sb.append('!');
break;
}
// Mutually exclusive patterns
sb.append(SpanishUnknownWordSignatures.conditionalSuffix(word));
sb.append(SpanishUnknownWordSignatures.imperfectSuffix(word));
sb.append(SpanishUnknownWordSignatures.infinitiveSuffix(word));
sb.append(SpanishUnknownWordSignatures.adverbSuffix(word));
// Broad coverage patterns -- only apply if we haven't yet matched at all
if (sb.toString().equals(BASE_LABEL)) {
if (SpanishUnknownWordSignatures.hasVerbFirstPersonPluralSuffix(word)) {
sb.append("-vb1p");
} else if (SpanishUnknownWordSignatures.hasGerundSuffix(word)) {
sb.append("-ger");
} else if (word.endsWith("s")) {
sb.append("-s");
}
}
// Backoff to suffix if we haven't matched anything else
if (unknownSuffixSize > 0 && sb.toString().equals(BASE_LABEL)) {
int min = word.length() < unknownSuffixSize ? word.length() : unknownSuffixSize;
sb.append('-').append(word.substring(word.length() - min));
}
char first = word.charAt(0);
if ((Character.isUpperCase(first) || Character.isTitleCase(first)) && !isUpperCase(word)) {
sb.append("-C");
} else {
sb.append("-c");
}
break;
default:
log.error(String.format("%s: Invalid unknown word signature! (%d)%n", this.getClass().getName(),unknownLevel));
}
return sb.toString();
}
private static boolean isUpperCase(String s) {
for (int i = 0; i < s.length(); i++) {
if (Character.isLowerCase(s.charAt(i)))
return false;
}
return true;
}
private static final long serialVersionUID = 5370429530690606644L;
}