package edu.stanford.nlp.parser.lexparser;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Index;
/**
 * An unknown word model for German; relies on BaseUnknownWordModel plus number matching.
 * An assumption of this model is that numbers (Arabic digit sequences, optionally
 * followed by a period and further digits) are tagged CARD. This is correct for all
 * of NEGRA/Tiger/TueBaDZ.
 *
 * @author Roger Levy
 * @author Greg Donaker (corrections and modeling improvements)
 * @author Christopher Manning (generalized and improved what Greg did)
 */
public class GermanUnknownWordModel extends BaseUnknownWordModel {

  private static final long serialVersionUID = 221L;

  /** The only tag this model permits for a word recognized as a number. */
  private static final String CARD_TAG = "CARD";

  /**
   * Matches a whole word consisting of one or more ASCII digits, optionally
   * followed by a period and zero or more further digits (e.g. "42", "42.", "42.5").
   * The fractional part must be optional: otherwise plain digit sequences would
   * never be recognized as numbers. Compiled once because {@link #score} is
   * called for every unknown word/tag pair.
   */
  private static final Pattern NUMBER_PATTERN = Pattern.compile("[0-9]+(?:\\.[0-9]*)?");

  /**
   * Creates an unknown word model from already populated data structures,
   * all of which are passed through unchanged to the superclass constructor.
   */
  public GermanUnknownWordModel(Options op, Lexicon lex,
                                Index<String> wordIndex,
                                Index<String> tagIndex,
                                ClassicCounter<IntTaggedWord> unSeenCounter,
                                Map<Label,ClassicCounter<String>> tagHash,
                                Map<String,Float> unknownGT,
                                Set<String> seenEnd) {
    super(op, lex, wordIndex, tagIndex,
          unSeenCounter, tagHash, unknownGT, seenEnd);
  }

  /**
   * This constructor creates an UWM with empty data structures. Only
   * use if loading in the data separately, such as by reading in text
   * lines containing the data.
   */
  public GermanUnknownWordModel(Options op, Lexicon lex,
                                Index<String> wordIndex,
                                Index<String> tagIndex) {
    this(op, lex, wordIndex, tagIndex,
         new ClassicCounter<>(),
         Generics.<Label,ClassicCounter<String>>newHashMap(),
         Generics.<String,Float>newHashMap(),
         Generics.<String>newHashSet());
  }

  /**
   * Calculate the log-prob score of a particular TaggedWord in the
   * unknown word model.
   * <p>
   * A word that looks like a number is scored deterministically: 0.0 when the
   * proposed tag is CARD and negative infinity for any other tag. Every other
   * word is scored by the superclass.
   *
   * @param itw the tag->word production in IntTaggedWord form
   * @param word the surface form of the word being scored
   * @return The log-prob score of a particular TaggedWord.
   */
  @Override
  public float score(IntTaggedWord itw, String word) {
    String tag = itw.tagString(tagIndex);
    if (!NUMBER_PATTERN.matcher(word).matches()) {
      return super.score(itw, word);
    }
    // Constant-first comparison so that a null tag yields "impossible" rather than an NPE.
    return CARD_TAG.equals(tag) ? 0.0f : Float.NEGATIVE_INFINITY;
  }

}