package edu.stanford.nlp.parser.lexparser; import edu.stanford.nlp.util.logging.Redwood; import edu.stanford.nlp.international.french.FrenchUnknownWordSignatures; import edu.stanford.nlp.stats.ClassicCounter; import edu.stanford.nlp.util.Index; public class FrenchUnknownWordModel extends BaseUnknownWordModel { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(FrenchUnknownWordModel.class); private static final long serialVersionUID = -776564693549194424L; protected final boolean smartMutation; protected final int unknownSuffixSize; protected final int unknownPrefixSize; public FrenchUnknownWordModel(Options op, Lexicon lex, Index<String> wordIndex, Index<String> tagIndex, ClassicCounter<IntTaggedWord> unSeenCounter) { super(op, lex, wordIndex, tagIndex, unSeenCounter, null, null, null); this.smartMutation = op.lexOptions.smartMutation; this.unknownSuffixSize = op.lexOptions.unknownSuffixSize; this.unknownPrefixSize = op.lexOptions.unknownPrefixSize; } /** * This constructor creates an UWM with empty data structures. Only * use if loading in the data separately, such as by reading in text * lines containing the data. */ public FrenchUnknownWordModel(Options op, Lexicon lex, Index<String> wordIndex, Index<String> tagIndex) { this(op, lex, wordIndex, tagIndex, new ClassicCounter<>()); } @Override public float score(IntTaggedWord iTW, int loc, double c_Tseen, double total, double smooth, String word) { double pb_W_T; // always set below // unknown word model for P(T|S) int wordSig = getSignatureIndex(iTW.word, loc, word); IntTaggedWord temp = new IntTaggedWord(wordSig, iTW.tag); double c_TS = unSeenCounter.getCount(temp); temp = new IntTaggedWord(wordSig, nullTag); double c_S = unSeenCounter.getCount(temp); double c_U = unSeenCounter.getCount(NULL_ITW); temp = new IntTaggedWord(nullWord, iTW.tag); double c_T = unSeenCounter.getCount(temp); double p_T_U = c_T / c_U; if (unknownLevel == 0) { c_TS = 0; c_S = 0; } double pb_T_S = (c_TS + smooth * p_T_U) / (c_S + smooth); double p_T = (c_Tseen / total); double p_W = 1.0 / total; pb_W_T = Math.log(pb_T_S * p_W / p_T); return (float) pb_W_T; } /** * Returns the index of the signature of the word numbered wordIndex, where * the signature is the String representation of unknown word features. */ @Override public int getSignatureIndex(int index, int sentencePosition, String word) { String uwSig = getSignature(word, sentencePosition); int sig = wordIndex.addToIndex(uwSig); return sig; } /** * TODO Can add various signatures, setting the signature via Options. * * @param word The word to make a signature for * @param loc Its position in the sentence (mainly so sentence-initial * capitalized words can be treated differently) * @return A String that is its signature (equivalence class) */ @Override public String getSignature(String word, int loc) { final String BASE_LABEL = "UNK"; StringBuilder sb = new StringBuilder(BASE_LABEL); switch (unknownLevel) { case 1: //Marie's initial attempt sb.append(FrenchUnknownWordSignatures.nounSuffix(word)); if(sb.toString().equals(BASE_LABEL)) { sb.append(FrenchUnknownWordSignatures.adjSuffix(word)); if(sb.toString().equals(BASE_LABEL)) { sb.append(FrenchUnknownWordSignatures.verbSuffix(word)); if(sb.toString().equals(BASE_LABEL)) { sb.append(FrenchUnknownWordSignatures.advSuffix(word)); } } } sb.append(FrenchUnknownWordSignatures.possiblePlural(word)); String hasDigit = FrenchUnknownWordSignatures.hasDigit(word); String isDigit = FrenchUnknownWordSignatures.isDigit(word); if( ! hasDigit.equals("")) { if(isDigit.equals("")) { sb.append(hasDigit); } else { sb.append(isDigit); } } // if(FrenchUnknownWordSignatures.isPunc(word).equals("")) sb.append(FrenchUnknownWordSignatures.hasPunc(word)); // else // sb.append(FrenchUnknownWordSignatures.isPunc(word)); sb.append(FrenchUnknownWordSignatures.isAllCaps(word)); if(loc > 0) { if(FrenchUnknownWordSignatures.isAllCaps(word).equals("")) sb.append(FrenchUnknownWordSignatures.isCapitalized(word)); } //Backoff to suffix if we haven't matched anything else if(unknownSuffixSize > 0 && sb.toString().equals(BASE_LABEL)) { int min = word.length() < unknownSuffixSize ? word.length(): unknownSuffixSize; sb.append('-').append(word.substring(word.length() - min)); } break; default: System.err.printf("%s: Invalid unknown word signature! (%d)%n", this.getClass().getName(),unknownLevel); } return sb.toString(); } }