/*******************************************************************************
* Copyright (c) 2012 György Orosz, Attila Novák.
* All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser General Public License v3
* which accompanies this distribution, and is available at
* http://www.gnu.org/licenses/
*
* This file is part of PurePos.
*
* PurePos is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* PurePos is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
*
* Contributors:
* György Orosz - initial API and implementation
******************************************************************************/
package hu.ppke.itk.nlpg.purepos.model.internal;

import hu.ppke.itk.nlpg.docmodel.IDocument;
import hu.ppke.itk.nlpg.docmodel.ISentence;
import hu.ppke.itk.nlpg.docmodel.IToken;
import hu.ppke.itk.nlpg.docmodel.internal.Sentence;
import hu.ppke.itk.nlpg.docmodel.internal.Token;
import hu.ppke.itk.nlpg.purepos.cli.configuration.Configuration;
import hu.ppke.itk.nlpg.purepos.common.SpecTokenMatcher;
import hu.ppke.itk.nlpg.purepos.common.Statistics;
import hu.ppke.itk.nlpg.purepos.common.Util;
import hu.ppke.itk.nlpg.purepos.common.lemma.LemmaUtil;
import hu.ppke.itk.nlpg.purepos.model.ISpecTokenMatcher;
import hu.ppke.itk.nlpg.purepos.model.Model;
import hu.ppke.itk.nlpg.purepos.model.ModelData;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;

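/**
 * Trainable model of the PurePos tagger: accumulates raw counts (tag and
 * emission n-grams, suffix and lemma statistics) from a training corpus and
 * can be compiled into a {@link CompiledModel} that is used for tagging.
 *
 * A minimal usage sketch (assuming an already parsed training
 * {@link IDocument} and a CLI {@link Configuration}):
 *
 * <pre>
 * RawModel model = new RawModel(2, 2, 10, 10);
 * model.train(trainingDocument);
 * CompiledModel&lt;String, Integer&gt; compiled = model.compile(configuration);
 * </pre>
 */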
public class RawModel extends Model<String, Integer> {

	private static final long serialVersionUID = 8860320542881381547L;

	/** Creates a model with default parameters (orders 2, suffix length 10, rare frequency 10). */
	@SuppressWarnings("unused")
	private RawModel() {
		this(2, 2, 10, 10);
	}

	protected RawModelData rawModeldata;

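	/** Returns the training statistics collected so far. */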
public Statistics getLastStat() {
return rawModeldata.stat;
}
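	/**
	 * Creates an empty model with the given tagging and emission orders, the
	 * suffix length used by the guessers, and the rare-word frequency
	 * threshold.
	 */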
public RawModel(int taggingOrder, int emissionOrder, int suffixLength,
int rareFrequency) {
this(ModelData.create(taggingOrder, emissionOrder, suffixLength,
rareFrequency));
}
public RawModel(ModelData<String, Integer> modelData) {
super(modelData);
this.rawModeldata = new RawModelData(modelData.taggingOrder,
modelData.emissionOrder);
}
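	/**
	 * Trains the model on a document: registers the EOS tag, adds every
	 * sentence (prefixed with a BOS marker) to the raw counts, rebuilds the
	 * suffix tries and re-estimates the combiner's parameters.
	 */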
public void train(IDocument document) {
this.rawModeldata.eosTag = data.tagVocabulary.addElement(getEOSTag());
for (ISentence sentence : document.getSentences()) {
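			// work on a copy so that adding the BOS marker does not modify
			// the caller's document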
ISentence mySentence = new Sentence(sentence);
addSentenceMarkers(mySentence);
addSentence(mySentence);
}
buildSuffixTrees();
rawModeldata.combiner.calculateParameters(document, rawModeldata, data);
}
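	/** Prepends the BOS (beginning of sentence) marker token to the sentence. */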
protected void addSentenceMarkers(ISentence mySentence) {
mySentence.add(0, new Token(ModelData.BOS_TOKEN, ModelData.BOS_TAG));
}
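	/**
	 * Adds one sentence to the raw counts: tag transition n-grams, word and
	 * special-token emission n-grams, the lexicons and the lemma statistics.
	 */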
protected void addSentence(ISentence sentence) {
rawModeldata.stat.incrementSentenceCount();
ISpecTokenMatcher specMatcher = new SpecTokenMatcher();
		// collect the tag ids of the sentence tokens; the list is built
		// backwards and then reversed into left-to-right order
		List<Integer> tags = new ArrayList<Integer>();
		for (int j = sentence.size() - 1; j >= 0; --j) {
			tags.add(data.tagVocabulary.addElement(sentence.get(j).getTag()));
		}
		Collections.reverse(tags);
		// add the end-of-sentence transition: the EOS tag with the whole
		// sentence's tags as context
		rawModeldata.tagNGramModel.addWord(tags, rawModeldata.eosTag);
for (int i = sentence.size() - 1; i >= 0; --i) {
IToken token = sentence.get(i);
if (!token.getToken().equals(ModelData.BOS_TOKEN)) {
token = Util.simplifyLemma(token);
}
String word = token.getToken();
String lemma = token.getStem();
String tagStr = token.getTag();
Integer tag = tags.get(i);
			// the tags up to and including position i, and the preceding
			// tags used as the transition context
			List<Integer> context = tags.subList(0, i + 1);
			List<Integer> prevTags = context.subList(0, context.size() - 1);
			if (!(word.equals(Model.getBOSToken()) || word.equals(Model
					.getEOSToken()))) {
				LemmaUtil.storeLemma(word, lemma, tag, tagStr, rawModeldata);
				rawModeldata.tagNGramModel.addWord(prevTags, tag);
				rawModeldata.stat.incrementTokenCount();
				data.standardTokensLexicon.addToken(word, tag);
				rawModeldata.stdEmissionNGramModel.addWord(context, word);
				String specName = specMatcher.matchLexicalElement(word);
				if (specName != null) {
					rawModeldata.specEmissionNGramModel.addWord(context,
							specName);
					// the lexicon stores the matched pattern name here;
					// HunPOS stores the surface form instead:
					// specTokensLexicon.addToken(word, tag);
					data.specTokensLexicon.addToken(specName, tag);
				}
}
}
}
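	/**
	 * (Re)builds the lowercase and uppercase suffix tries from the rare words
	 * of the lexicon. Must be called again whenever the lexicon changes.
	 */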
	protected void buildSuffixTrees() {
rawModeldata.lowerSuffixTree = new HashSuffixTree<Integer>(
data.suffixLength);
rawModeldata.upperSuffixTree = new HashSuffixTree<Integer>(
data.suffixLength);
for (Entry<String, HashMap<Integer, Integer>> entry : data.standardTokensLexicon) {
String word = entry.getKey();
int wordFreq = data.standardTokensLexicon.getWordCount(word);
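			// only words not more frequent than the rare-frequency threshold
			// contribute to the suffix tries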
if (wordFreq <= data.rareFreqency) {
String lowerWord = Util.toLower(word);
boolean isLower = !Util.isUpper(lowerWord, word);
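				// both tries store the lowercased form; the original casing
				// only selects which guesser the word trains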
for (Integer tag : entry.getValue().keySet()) {
int wordTagFreq = data.standardTokensLexicon
.getWordCountForTag(word, tag);
if (isLower) {
rawModeldata.lowerSuffixTree.addWord(lowerWord, tag,
wordTagFreq);
rawModeldata.stat
.incrementLowerGuesserItems(wordTagFreq);
} else {
rawModeldata.upperSuffixTree.addWord(lowerWord, tag,
wordTagFreq);
rawModeldata.stat
.incrementUpperGuesserItems(wordTagFreq);
}
}
}
}
}
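	/** Replaces a missing (null) value with the default unknown value. */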
protected Double smooth(Double val) {
if (val == null)
return Util.UNKOWN_VALUE;
return val;
}
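	/**
	 * Compiles the raw counts into the probability models and suffix guessers
	 * of a {@link CompiledModel}, applying the tag mappings given in the
	 * configuration.
	 */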
public CompiledModel<String, Integer> compile(Configuration conf) {
data.tagVocabulary.storeMaximalElement();
CompiledModelData<String, Integer> compiledModelData = RawModelData
.compile(this.rawModeldata);
Util.addMappings(compiledModelData, data.tagVocabulary,
conf.getTagMappings());
return new CompiledModel<String, Integer>(compiledModelData, this.data);
}
}