package edu.stanford.nlp.parser.lexparser;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.util.Index;
/**
 * Collects the counts used by a {@link FrenchUnknownWordModel}.
 *
 * <p>Each call to {@link #train(TaggedWord, int, double)} records the word as
 * seen. Once more than {@code fractionBeforeUnseenCounting} of the training
 * trees have been read, words whose accumulated seen count is still below 2
 * also contribute to the unseen-word counter, keyed by the word signature
 * that the model computes for them.
 *
 * <p>{@code wordIndex}, {@code tagIndex}, {@code treesRead}, {@code nullWord},
 * {@code nullTag} and {@code NULL_ITW} are not declared in this class;
 * presumably they are inherited from {@link AbstractUnknownWordModelTrainer}.
 */
public class FrenchUnknownWordModelTrainer
    extends AbstractUnknownWordModelTrainer {

  /** Boundary tag -- assumed not a real tag, so it gets no fallback mass. */
  private static final String BOUNDARY_TAG = ".$$.";

  /** Accumulated weight of every word seen so far (keyed by word, null tag). */
  private ClassicCounter<IntTaggedWord> seenCounter;

  /** Counts for words treated as unknown; handed to the model at creation. */
  private ClassicCounter<IntTaggedWord> unSeenCounter;

  /** Value that {@code treesRead} must exceed before unseen counting starts. */
  private double indexToStartUnkCounting;

  /** The model under construction; returned by {@link #finishTraining()}. */
  private UnknownWordModel model;

  /**
   * Resets the counters and creates the model that will be trained.
   *
   * @param op parser options; {@code op.trainOptions.fractionBeforeUnseenCounting}
   *     sets how far into the trees unseen counting begins
   * @param lex lexicon passed through to the model
   * @param wordIndex word index passed through to the superclass and the model
   * @param tagIndex tag index passed through to the superclass and the model
   * @param totalTrees total number of training trees (may be fractional)
   */
  @Override
  public void initializeTraining(Options op, Lexicon lex,
                                 Index<String> wordIndex,
                                 Index<String> tagIndex, double totalTrees) {
    super.initializeTraining(op, lex, wordIndex, tagIndex, totalTrees);

    indexToStartUnkCounting =
        totalTrees * op.trainOptions.fractionBeforeUnseenCounting;

    seenCounter = new ClassicCounter<>();
    unSeenCounter = new ClassicCounter<>();

    // The model is given the unseen counter instance that train() and
    // finishTraining() keep updating afterwards.
    model = new FrenchUnknownWordModel(op, lex, wordIndex, tagIndex,
                                       unSeenCounter);
  }

  /**
   * Trains on a single tagged word.
   *
   * @param tw the word and its tag
   * @param loc position argument forwarded to the model's signature lookup
   *     (presumably the word's index in its sentence)
   * @param weight amount by which each affected count is incremented
   */
  public void train(TaggedWord tw, int loc, double weight) {
    IntTaggedWord iTW =
        new IntTaggedWord(tw.word(), tw.tag(), wordIndex, tagIndex);
    IntTaggedWord iT = new IntTaggedWord(nullWord, iTW.tag);
    IntTaggedWord iW = new IntTaggedWord(iTW.word, nullTag);

    seenCounter.incrementCount(iW, weight);

    // Start unseen counting only once some way through the trees (treesRead
    // is 1 based counting), and only for words whose accumulated seen count
    // is still below 2, i.e. entirely unknown words.
    if (treesRead > indexToStartUnkCounting
        && seenCounter.getCount(iW) < 2) {
      int s = model.getSignatureIndex(iTW.word, loc,
                                      wordIndex.get(iTW.word));
      IntTaggedWord iTS = new IntTaggedWord(s, iTW.tag);
      IntTaggedWord iS = new IntTaggedWord(s, nullTag);

      unSeenCounter.incrementCount(iTS, weight);      // signature + tag
      unSeenCounter.incrementCount(iT, weight);       // tag only
      unSeenCounter.incrementCount(iS, weight);       // signature only
      unSeenCounter.incrementCount(NULL_ITW, weight); // grand total
    }
  }

  /**
   * Finishes training and returns the model.
   *
   * <p>If no unseen counts were collected, a warning is written to stderr and
   * the unseen counter is filled with a uniform distribution over all tags
   * except {@link #BOUNDARY_TAG}, so that the counter is never empty.
   *
   * @return the model created in {@code initializeTraining}
   */
  public UnknownWordModel finishTraining() {
    if (unSeenCounter.isEmpty()) {
      // %n terminates the line so later stderr output is not appended to
      // the warning text.
      System.err.printf("%s: WARNING: Unseen word counter is empty!%n",
                        this.getClass().getName());

      int numTags = tagIndex.size();
      for (int tt = 0; tt < numTags; tt++) {
        if (BOUNDARY_TAG.equals(tagIndex.get(tt))) {
          continue;
        }
        // One count per tag, plus the matching increment of the total.
        unSeenCounter.incrementCount(new IntTaggedWord(nullWord, tt));
        unSeenCounter.incrementCount(NULL_ITW);
      }
    }
    return model;
  }
}